In [3]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Set up logging so gensim's progress messages are shown in the notebook
# Records are printed as "LEVEL - HH:MM:SS: message"; INFO and above are emitted.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.1.1-py2.py3-none-any.whl (238 kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Read the raw dataset, then derive the cleaned frame: rows with missing values and
# duplicated rows are removed, the index is renumbered and the text column
# 'description' is exposed under the name 'title'.
file = pd.read_csv("polish_sentiment_dataset.csv")
file_cleaned = (
    file
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'description': 'title'})
)

In [5]:
# Preview the first rows of the cleaned frame.
file_cleaned.head()

Unnamed: 0,title,length,rate
0,Polecam nie pierwszy i nie ostatni raz!,39.0,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,121.0,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,87.0,1.0
3,0,0.0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,48.0,1.0


In [6]:
# Share of rows per distinct `rate` value (count of each value divided by the number of rows).
file_cleaned.rate.value_counts()/len(file_cleaned)

INFO - 23:16:41: NumExpr defaulting to 4 threads.


 1.0    0.984766
-1.0    0.015233
 0.0    0.000002
Name: rate, dtype: float64

In [7]:
# Show the rows whose rate equals 0.
file_cleaned[file_cleaned.rate==0]

Unnamed: 0,title,length,rate
3,0,0.0,0.0


In [8]:
# Drop the rows rated 0. The explicit .copy() makes the result an independent frame,
# so assigning a column on it afterwards does not write into a slice of the
# previous frame (which is what raises pandas' SettingWithCopyWarning).
file_cleaned = file_cleaned[file_cleaned.rate != 0].copy()

In [10]:
def text_to_word_list(text, remove_polish_letters):
    '''Pre process a text and convert it to a list of lowercase tokens.

    Method inspired by the method from eliorc's github repo:
    https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    text - value to tokenise; it is converted with str() first, so non-string
        values are accepted as well,
    remove_polish_letters - callable that receives the text as a str and returns
        the transliterated text; it is applied before lowercasing.

    Returns the list of tokens: '+' is replaced by the word 'plus', '!' and '?'
    become tokens of their own, commas, periods and apostrophes are dropped.
    '''
    # Convert to str BEFORE transliterating so the callable never receives a
    # non-string value; the outer str() keeps tolerating a non-str return value.
    text = str(remove_polish_letters(str(text)))
    text = text.lower()

    # Every character outside this class becomes a space. '^' and '/' are members
    # of the class and therefore survive inside tokens; ':' is not a member, so
    # colons are already gone after this substitution.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r"[,.']", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)

    # split() without arguments already collapses any run of whitespace.
    return text.split()

In [11]:
# Replace every title with its token list. assign() builds a new frame instead of
# setting an attribute on a frame that may be a filtered slice of another one,
# which avoids pandas' SettingWithCopyWarning.
file_cleaned = file_cleaned.assign(
    title=file_cleaned.title.apply(lambda raw_text: text_to_word_list(raw_text, unidecode))
)

In [12]:
# Work on an independent copy and keep only the rows whose token list
# holds more than one element.
file_model = file_cleaned.copy()
has_several_tokens = file_model.title.str.len() > 1
file_model = file_model[has_several_tokens]

In [23]:
# Token lists, one per row of file_model.
sent = list(file_model.title)
# Fit the phrase model on the token lists and wrap it in a Phraser.
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
# Apply the bigram model to the whole corpus; merged pairs appear as single "a_b" tokens.
sentences = bigram[sent]
# Tokens of the second sentence before bigram merging (only the last expression
# of a cell is displayed, so the merged version is shown in its own cell).
sent[1]

INFO - 07:42:07: collecting all words and their counts
INFO - 07:42:07: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 07:42:09: PROGRESS: at sentence #50000, processed 611860 words and 169257 word types
INFO - 07:42:11: PROGRESS: at sentence #100000, processed 1200585 words and 265494 word types
INFO - 07:42:12: PROGRESS: at sentence #150000, processed 1811398 words and 347364 word types
INFO - 07:42:13: PROGRESS: at sentence #200000, processed 2470241 words and 436886 word types
INFO - 07:42:14: PROGRESS: at sentence #250000, processed 3058090 words and 509631 word types
INFO - 07:42:16: PROGRESS: at sentence #300000, processed 3656884 words and 571393 word types
INFO - 07:42:17: PROGRESS: at sentence #350000, processed 4268981 words and 638483 word types
INFO - 07:42:18: PROGRESS: at sentence #400000, processed 5028018 words and 758229 word types
INFO - 07:42:20: PROGRESS: at sentence #450000, processed 5756675 words and 869938 word types
INFO - 07:42:21: PROGRE

['bardzo',
 'dobra',
 'komunikacja',
 'sms',
 'i',
 'telefoniczna',
 'zamowiony',
 'towar',
 'wyslany',
 'w',
 'terminie',
 'dobrze',
 'zabezpieczony',
 'polecam',
 'ten',
 'sklep']

In [25]:
# Second sentence after the bigram model has been applied.
sentences[1]

['bardzo',
 'dobra_komunikacja',
 'sms',
 'i',
 'telefoniczna',
 'zamowiony',
 'towar',
 'wyslany',
 'w',
 'terminie',
 'dobrze_zabezpieczony',
 'polecam',
 'ten',
 'sklep']

In [14]:
# cpu_count() - 1 evaluates to 0 on a single-core machine; always keep at least one worker.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# The model is only configured here; the vocabulary is built below and the
# training itself happens in a later cell.
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

start = time()

# Scan the bigram-merged corpus once to collect the vocabulary.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 23:55:26: collecting all words and their counts
INFO - 23:55:26: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 23:55:32: PROGRESS: at sentence #50000, processed 523298 words, keeping 32119 word types
INFO - 23:55:38: PROGRESS: at sentence #100000, processed 1028955 words, keeping 46859 word types
INFO - 23:55:43: PROGRESS: at sentence #150000, processed 1551617 words, keeping 57721 word types
INFO - 23:55:48: PROGRESS: at sentence #200000, processed 2114830 words, keeping 69151 word types
INFO - 23:55:53: PROGRESS: at sentence #250000, processed 2617292 words, keeping 78351 word types
INFO - 23:55:59: PROGRESS: at sentence #300000, processed 3121588 words, keeping 85531 word types
INFO - 23:56:06: PROGRESS: at sentence #350000, processed 3640071 words, keeping 93225 word types
INFO - 23:56:11: PROGRESS: at sentence #400000, processed 4287249 words, keeping 107299 word types
INFO - 23:56:22: PROGRESS: at sentence #450000, processed 4905897 words, keepin

Time to build vocab: 2.28 mins


In [19]:
# Train for 30 epochs on the bigram-merged corpus and report the elapsed wall-clock minutes.
start = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
elapsed_minutes = round((time() - start) / 60, 2)
print(f'Time to train the model: {elapsed_minutes} mins')

# NOTE(review): per gensim's documentation replace=True keeps only the L2-normalised
# vectors, after which the model should not be trained further - confirm for the
# gensim version in use.
w2v_model.init_sims(replace=True)

INFO - 00:46:47: training model with 3 workers on 63643 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 00:46:48: EPOCH 1 - PROGRESS: at 1.61% examples, 25734 words/s, in_qsize 1, out_qsize 0
INFO - 00:46:49: EPOCH 1 - PROGRESS: at 3.15% examples, 26236 words/s, in_qsize 0, out_qsize 0
INFO - 00:46:50: EPOCH 1 - PROGRESS: at 4.28% examples, 23316 words/s, in_qsize 2, out_qsize 0
INFO - 00:46:51: EPOCH 1 - PROGRESS: at 5.92% examples, 23422 words/s, in_qsize 4, out_qsize 0
INFO - 00:46:52: EPOCH 1 - PROGRESS: at 7.68% examples, 22304 words/s, in_qsize 5, out_qsize 0
INFO - 00:46:53: EPOCH 1 - PROGRESS: at 9.89% examples, 22609 words/s, in_qsize 5, out_qsize 0
INFO - 00:46:54: EPOCH 1 - PROGRESS: at 11.69% examples, 22588 words/s, in_qsize 4, out_qsize 0
INFO - 00:46:56: EPOCH 1 - PROGRESS: at 13.75% examples, 22947 words/s, in_qsize 3, out_qsize 0
INFO - 00:46:57: EPOCH 1 - PROGRESS: at 15.94% examples, 23841 words/s, in_qsize 1, out_qsize 0
INFO - 

INFO - 00:48:13: EPOCH 2 - PROGRESS: at 23.06% examples, 23953 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:14: EPOCH 2 - PROGRESS: at 24.95% examples, 24049 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:15: EPOCH 2 - PROGRESS: at 26.48% examples, 24066 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:16: EPOCH 2 - PROGRESS: at 27.66% examples, 23958 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:17: EPOCH 2 - PROGRESS: at 29.33% examples, 23861 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:18: EPOCH 2 - PROGRESS: at 30.76% examples, 23766 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:19: EPOCH 2 - PROGRESS: at 32.29% examples, 23615 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:20: EPOCH 2 - PROGRESS: at 33.81% examples, 23447 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:21: EPOCH 2 - PROGRESS: at 35.62% examples, 23505 words/s, in_qsize 0, out_qsize 0
INFO - 00:48:22: EPOCH 2 - PROGRESS: at 37.48% examples, 23504 words/s, in_qsize 1, out_qsize 0
INFO - 00:48:23: EPOCH 2 - PROGRESS: at 

INFO - 00:49:40: EPOCH 3 - PROGRESS: at 50.95% examples, 21630 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:41: EPOCH 3 - PROGRESS: at 52.33% examples, 21512 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:42: EPOCH 3 - PROGRESS: at 53.68% examples, 21522 words/s, in_qsize 1, out_qsize 0
INFO - 00:49:43: EPOCH 3 - PROGRESS: at 54.94% examples, 21603 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:44: EPOCH 3 - PROGRESS: at 56.34% examples, 21648 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:45: EPOCH 3 - PROGRESS: at 57.67% examples, 21782 words/s, in_qsize 1, out_qsize 0
INFO - 00:49:46: EPOCH 3 - PROGRESS: at 58.98% examples, 21933 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:47: EPOCH 3 - PROGRESS: at 60.10% examples, 21970 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:48: EPOCH 3 - PROGRESS: at 61.28% examples, 22065 words/s, in_qsize 0, out_qsize 0
INFO - 00:49:49: EPOCH 3 - PROGRESS: at 62.44% examples, 22143 words/s, in_qsize 1, out_qsize 0
INFO - 00:49:50: EPOCH 3 - PROGRESS: at 

INFO - 00:51:07: EPOCH 4 - PROGRESS: at 77.90% examples, 26994 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:08: EPOCH 4 - PROGRESS: at 79.74% examples, 27050 words/s, in_qsize 0, out_qsize 1
INFO - 00:51:09: EPOCH 4 - PROGRESS: at 81.37% examples, 27122 words/s, in_qsize 1, out_qsize 0
INFO - 00:51:10: EPOCH 4 - PROGRESS: at 83.04% examples, 27111 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:11: EPOCH 4 - PROGRESS: at 84.55% examples, 27038 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:12: EPOCH 4 - PROGRESS: at 85.58% examples, 27093 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:13: EPOCH 4 - PROGRESS: at 87.01% examples, 27220 words/s, in_qsize 1, out_qsize 0
INFO - 00:51:14: EPOCH 4 - PROGRESS: at 87.95% examples, 27271 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:15: EPOCH 4 - PROGRESS: at 89.08% examples, 27419 words/s, in_qsize 1, out_qsize 0
INFO - 00:51:16: EPOCH 4 - PROGRESS: at 90.65% examples, 27503 words/s, in_qsize 0, out_qsize 0
INFO - 00:51:17: EPOCH 4 - PROGRESS: at 

INFO - 00:52:31: EPOCH 6 - PROGRESS: at 9.56% examples, 26510 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:32: EPOCH 6 - PROGRESS: at 11.40% examples, 25887 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:33: EPOCH 6 - PROGRESS: at 13.16% examples, 25778 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:34: EPOCH 6 - PROGRESS: at 14.34% examples, 24669 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:35: EPOCH 6 - PROGRESS: at 15.94% examples, 24337 words/s, in_qsize 0, out_qsize 1
INFO - 00:52:36: EPOCH 6 - PROGRESS: at 17.84% examples, 24589 words/s, in_qsize 1, out_qsize 0
INFO - 00:52:37: EPOCH 6 - PROGRESS: at 19.67% examples, 24533 words/s, in_qsize 0, out_qsize 0
INFO - 00:52:38: EPOCH 6 - PROGRESS: at 21.15% examples, 24245 words/s, in_qsize 1, out_qsize 0
INFO - 00:52:39: EPOCH 6 - PROGRESS: at 23.06% examples, 24424 words/s, in_qsize 1, out_qsize 0
INFO - 00:52:40: EPOCH 6 - PROGRESS: at 24.95% examples, 24595 words/s, in_qsize 1, out_qsize 0
INFO - 00:52:41: EPOCH 6 - PROGRESS: at 2

INFO - 00:53:57: EPOCH 7 - PROGRESS: at 35.47% examples, 21263 words/s, in_qsize 0, out_qsize 1
INFO - 00:53:58: EPOCH 7 - PROGRESS: at 37.01% examples, 21195 words/s, in_qsize 0, out_qsize 1
INFO - 00:53:59: EPOCH 7 - PROGRESS: at 39.07% examples, 21374 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:00: EPOCH 7 - PROGRESS: at 41.02% examples, 21621 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:01: EPOCH 7 - PROGRESS: at 43.02% examples, 21799 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:02: EPOCH 7 - PROGRESS: at 44.90% examples, 21879 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:03: EPOCH 7 - PROGRESS: at 47.03% examples, 22149 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:04: EPOCH 7 - PROGRESS: at 48.92% examples, 22250 words/s, in_qsize 1, out_qsize 0
INFO - 00:54:05: EPOCH 7 - PROGRESS: at 50.02% examples, 21989 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:06: EPOCH 7 - PROGRESS: at 52.02% examples, 22105 words/s, in_qsize 0, out_qsize 0
INFO - 00:54:07: EPOCH 7 - PROGRESS: at 

INFO - 00:55:23: EPOCH 8 - PROGRESS: at 63.00% examples, 23745 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:24: EPOCH 8 - PROGRESS: at 64.00% examples, 23631 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:25: EPOCH 8 - PROGRESS: at 64.85% examples, 23614 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:26: EPOCH 8 - PROGRESS: at 66.62% examples, 23864 words/s, in_qsize 0, out_qsize 0
INFO - 00:55:27: EPOCH 8 - PROGRESS: at 68.15% examples, 23996 words/s, in_qsize 3, out_qsize 0
INFO - 00:55:28: EPOCH 8 - PROGRESS: at 69.61% examples, 24129 words/s, in_qsize 5, out_qsize 2
INFO - 00:55:29: EPOCH 8 - PROGRESS: at 71.67% examples, 24566 words/s, in_qsize 5, out_qsize 0
INFO - 00:55:30: EPOCH 8 - PROGRESS: at 73.27% examples, 24899 words/s, in_qsize 5, out_qsize 0
INFO - 00:55:31: EPOCH 8 - PROGRESS: at 74.86% examples, 25238 words/s, in_qsize 5, out_qsize 0
INFO - 00:55:32: EPOCH 8 - PROGRESS: at 75.72% examples, 25131 words/s, in_qsize 4, out_qsize 1
INFO - 00:55:33: EPOCH 8 - PROGRESS: at 

INFO - 00:56:46: EPOCH 10 - PROGRESS: at 80.39% examples, 49662 words/s, in_qsize 4, out_qsize 0
INFO - 00:56:47: EPOCH 10 - PROGRESS: at 83.55% examples, 49863 words/s, in_qsize 1, out_qsize 1
INFO - 00:56:48: EPOCH 10 - PROGRESS: at 86.26% examples, 50178 words/s, in_qsize 4, out_qsize 0
INFO - 00:56:49: EPOCH 10 - PROGRESS: at 88.16% examples, 50532 words/s, in_qsize 5, out_qsize 0
INFO - 00:56:50: EPOCH 10 - PROGRESS: at 90.65% examples, 50753 words/s, in_qsize 6, out_qsize 0
INFO - 00:56:51: EPOCH 10 - PROGRESS: at 93.95% examples, 50564 words/s, in_qsize 6, out_qsize 0
INFO - 00:56:52: EPOCH 10 - PROGRESS: at 97.27% examples, 50833 words/s, in_qsize 5, out_qsize 0
INFO - 00:56:53: worker thread finished; awaiting finish of 2 more threads
INFO - 00:56:53: worker thread finished; awaiting finish of 1 more threads
INFO - 00:56:53: worker thread finished; awaiting finish of 0 more threads
INFO - 00:56:53: EPOCH - 10 : training on 7487517 raw words (1788654 effective words) took 34.5s

INFO - 00:58:06: EPOCH 13 - PROGRESS: at 5.83% examples, 45455 words/s, in_qsize 5, out_qsize 0
INFO - 00:58:07: EPOCH 13 - PROGRESS: at 9.89% examples, 46086 words/s, in_qsize 3, out_qsize 0
INFO - 00:58:08: EPOCH 13 - PROGRESS: at 13.45% examples, 46730 words/s, in_qsize 1, out_qsize 0
INFO - 00:58:09: EPOCH 13 - PROGRESS: at 16.64% examples, 46551 words/s, in_qsize 3, out_qsize 1
INFO - 00:58:10: EPOCH 13 - PROGRESS: at 20.40% examples, 46097 words/s, in_qsize 5, out_qsize 0
INFO - 00:58:11: EPOCH 13 - PROGRESS: at 23.79% examples, 45793 words/s, in_qsize 6, out_qsize 0
INFO - 00:58:12: EPOCH 13 - PROGRESS: at 27.21% examples, 44838 words/s, in_qsize 5, out_qsize 0
INFO - 00:58:13: EPOCH 13 - PROGRESS: at 30.48% examples, 45521 words/s, in_qsize 5, out_qsize 0
INFO - 00:58:14: EPOCH 13 - PROGRESS: at 34.11% examples, 45975 words/s, in_qsize 4, out_qsize 0
INFO - 00:58:15: EPOCH 13 - PROGRESS: at 37.17% examples, 45591 words/s, in_qsize 6, out_qsize 0
INFO - 00:58:16: EPOCH 13 - PROG

INFO - 00:59:31: EPOCH 15 - PROGRESS: at 24.95% examples, 29348 words/s, in_qsize 0, out_qsize 0
INFO - 00:59:32: EPOCH 15 - PROGRESS: at 26.24% examples, 28655 words/s, in_qsize 3, out_qsize 0
INFO - 00:59:33: EPOCH 15 - PROGRESS: at 27.66% examples, 28504 words/s, in_qsize 2, out_qsize 0
INFO - 00:59:34: EPOCH 15 - PROGRESS: at 29.76% examples, 28575 words/s, in_qsize 0, out_qsize 1
INFO - 00:59:35: EPOCH 15 - PROGRESS: at 31.67% examples, 28438 words/s, in_qsize 1, out_qsize 0
INFO - 00:59:36: EPOCH 15 - PROGRESS: at 33.35% examples, 28033 words/s, in_qsize 4, out_qsize 3
INFO - 00:59:37: EPOCH 15 - PROGRESS: at 35.77% examples, 28347 words/s, in_qsize 3, out_qsize 0
INFO - 00:59:38: EPOCH 15 - PROGRESS: at 37.96% examples, 28287 words/s, in_qsize 2, out_qsize 0
INFO - 00:59:39: EPOCH 15 - PROGRESS: at 39.69% examples, 27906 words/s, in_qsize 4, out_qsize 0
INFO - 00:59:40: EPOCH 15 - PROGRESS: at 41.32% examples, 27689 words/s, in_qsize 6, out_qsize 0
INFO - 00:59:41: EPOCH 15 - PR

INFO - 01:00:53: EPOCH 17 - PROGRESS: at 47.17% examples, 46536 words/s, in_qsize 2, out_qsize 1
INFO - 01:00:55: EPOCH 17 - PROGRESS: at 51.11% examples, 46655 words/s, in_qsize 0, out_qsize 1
INFO - 01:00:56: EPOCH 17 - PROGRESS: at 54.45% examples, 47109 words/s, in_qsize 0, out_qsize 1
INFO - 01:00:57: EPOCH 17 - PROGRESS: at 57.42% examples, 47638 words/s, in_qsize 0, out_qsize 1
INFO - 01:00:58: EPOCH 17 - PROGRESS: at 59.55% examples, 47473 words/s, in_qsize 5, out_qsize 1
INFO - 01:00:59: EPOCH 17 - PROGRESS: at 62.12% examples, 48026 words/s, in_qsize 5, out_qsize 0
INFO - 01:01:00: EPOCH 17 - PROGRESS: at 64.76% examples, 48260 words/s, in_qsize 5, out_qsize 2
INFO - 01:01:01: EPOCH 17 - PROGRESS: at 67.89% examples, 48844 words/s, in_qsize 6, out_qsize 0
INFO - 01:01:02: EPOCH 17 - PROGRESS: at 69.86% examples, 48541 words/s, in_qsize 6, out_qsize 0
INFO - 01:01:03: EPOCH 17 - PROGRESS: at 72.64% examples, 48944 words/s, in_qsize 6, out_qsize 0
INFO - 01:01:04: EPOCH 17 - PR

INFO - 01:02:17: EPOCH 19 - PROGRESS: at 86.37% examples, 52002 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:18: EPOCH 19 - PROGRESS: at 87.87% examples, 51793 words/s, in_qsize 6, out_qsize 0
INFO - 01:02:19: EPOCH 19 - PROGRESS: at 89.47% examples, 51533 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:20: EPOCH 19 - PROGRESS: at 90.37% examples, 50382 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:21: EPOCH 19 - PROGRESS: at 91.21% examples, 48901 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:22: EPOCH 19 - PROGRESS: at 92.18% examples, 47909 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:23: EPOCH 19 - PROGRESS: at 93.36% examples, 47019 words/s, in_qsize 0, out_qsize 1
INFO - 01:02:24: EPOCH 19 - PROGRESS: at 94.66% examples, 46297 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:25: EPOCH 19 - PROGRESS: at 95.87% examples, 45564 words/s, in_qsize 0, out_qsize 0
INFO - 01:02:26: EPOCH 19 - PROGRESS: at 97.27% examples, 44900 words/s, in_qsize 3, out_qsize 0
INFO - 01:02:28: EPOCH 19 - PR

INFO - 01:03:39: EPOCH 21 - PROGRESS: at 99.08% examples, 51867 words/s, in_qsize 5, out_qsize 0
INFO - 01:03:40: worker thread finished; awaiting finish of 2 more threads
INFO - 01:03:40: worker thread finished; awaiting finish of 1 more threads
INFO - 01:03:40: worker thread finished; awaiting finish of 0 more threads
INFO - 01:03:40: EPOCH - 21 : training on 7487517 raw words (1788050 effective words) took 34.1s, 52469 effective words/s
INFO - 01:03:41: EPOCH 22 - PROGRESS: at 2.46% examples, 39800 words/s, in_qsize 5, out_qsize 1
INFO - 01:03:42: EPOCH 22 - PROGRESS: at 5.29% examples, 40999 words/s, in_qsize 4, out_qsize 0
INFO - 01:03:43: EPOCH 22 - PROGRESS: at 8.88% examples, 41694 words/s, in_qsize 2, out_qsize 1
INFO - 01:03:44: EPOCH 22 - PROGRESS: at 11.99% examples, 41030 words/s, in_qsize 4, out_qsize 2
INFO - 01:03:45: EPOCH 22 - PROGRESS: at 15.35% examples, 42221 words/s, in_qsize 5, out_qsize 0
INFO - 01:03:46: EPOCH 22 - PROGRESS: at 17.84% examples, 41105 words/s, i

INFO - 01:04:58: EPOCH 24 - PROGRESS: at 25.99% examples, 44926 words/s, in_qsize 6, out_qsize 0
INFO - 01:04:59: EPOCH 24 - PROGRESS: at 29.33% examples, 45876 words/s, in_qsize 3, out_qsize 0
INFO - 01:05:00: EPOCH 24 - PROGRESS: at 31.98% examples, 45319 words/s, in_qsize 5, out_qsize 0
INFO - 01:05:01: EPOCH 24 - PROGRESS: at 35.62% examples, 45615 words/s, in_qsize 5, out_qsize 0
INFO - 01:05:02: EPOCH 24 - PROGRESS: at 38.91% examples, 45406 words/s, in_qsize 5, out_qsize 2
INFO - 01:05:03: EPOCH 24 - PROGRESS: at 42.87% examples, 45995 words/s, in_qsize 5, out_qsize 0
INFO - 01:05:04: EPOCH 24 - PROGRESS: at 46.29% examples, 45968 words/s, in_qsize 3, out_qsize 2
INFO - 01:05:05: EPOCH 24 - PROGRESS: at 49.85% examples, 46174 words/s, in_qsize 6, out_qsize 0
INFO - 01:05:06: EPOCH 24 - PROGRESS: at 53.34% examples, 46165 words/s, in_qsize 6, out_qsize 0
INFO - 01:05:07: EPOCH 24 - PROGRESS: at 56.34% examples, 46634 words/s, in_qsize 6, out_qsize 0
INFO - 01:05:08: EPOCH 24 - PR

INFO - 01:06:22: EPOCH 26 - PROGRESS: at 70.85% examples, 49231 words/s, in_qsize 3, out_qsize 0
INFO - 01:06:23: EPOCH 26 - PROGRESS: at 73.17% examples, 49385 words/s, in_qsize 6, out_qsize 0
INFO - 01:06:24: EPOCH 26 - PROGRESS: at 75.82% examples, 50102 words/s, in_qsize 4, out_qsize 0
INFO - 01:06:25: EPOCH 26 - PROGRESS: at 78.69% examples, 50191 words/s, in_qsize 6, out_qsize 0
INFO - 01:06:26: EPOCH 26 - PROGRESS: at 81.84% examples, 50326 words/s, in_qsize 6, out_qsize 0
INFO - 01:06:27: EPOCH 26 - PROGRESS: at 84.77% examples, 50434 words/s, in_qsize 5, out_qsize 0
INFO - 01:06:28: EPOCH 26 - PROGRESS: at 87.49% examples, 50909 words/s, in_qsize 5, out_qsize 0
INFO - 01:06:30: EPOCH 26 - PROGRESS: at 89.59% examples, 51318 words/s, in_qsize 6, out_qsize 0
INFO - 01:06:31: EPOCH 26 - PROGRESS: at 92.75% examples, 51360 words/s, in_qsize 4, out_qsize 1
INFO - 01:06:32: EPOCH 26 - PROGRESS: at 95.87% examples, 51413 words/s, in_qsize 4, out_qsize 1
INFO - 01:06:33: EPOCH 26 - PR

INFO - 01:07:44: worker thread finished; awaiting finish of 0 more threads
INFO - 01:07:44: EPOCH - 28 : training on 7487517 raw words (1788438 effective words) took 35.8s, 49977 effective words/s
INFO - 01:07:45: EPOCH 29 - PROGRESS: at 2.31% examples, 38693 words/s, in_qsize 0, out_qsize 0
INFO - 01:07:46: EPOCH 29 - PROGRESS: at 5.00% examples, 39878 words/s, in_qsize 4, out_qsize 2
INFO - 01:07:47: EPOCH 29 - PROGRESS: at 8.88% examples, 42745 words/s, in_qsize 3, out_qsize 1
INFO - 01:07:48: EPOCH 29 - PROGRESS: at 12.13% examples, 42487 words/s, in_qsize 3, out_qsize 3
INFO - 01:07:49: EPOCH 29 - PROGRESS: at 15.64% examples, 43428 words/s, in_qsize 6, out_qsize 0
INFO - 01:07:50: EPOCH 29 - PROGRESS: at 19.96% examples, 45789 words/s, in_qsize 0, out_qsize 0
INFO - 01:07:51: EPOCH 29 - PROGRESS: at 23.79% examples, 46829 words/s, in_qsize 1, out_qsize 0
INFO - 01:07:52: EPOCH 29 - PROGRESS: at 26.35% examples, 45619 words/s, in_qsize 0, out_qsize 1
INFO - 01:07:53: EPOCH 29 - PR

Time to train the model: 22.09 mins


In [20]:
# Persist the trained model under the relative path "word2vec.model".
w2v_model.save("word2vec.model")

INFO - 01:08:52: saving Word2Vec object under word2vec.model, separately None
INFO - 01:08:52: storing np array 'vectors' to word2vec.model.wv.vectors.npy
INFO - 01:08:52: not storing attribute vectors_norm
INFO - 01:08:52: storing np array 'syn1neg' to word2vec.model.trainables.syn1neg.npy
INFO - 01:08:53: not storing attribute cum_table
INFO - 01:08:53: saved word2vec.model


In [21]:
# Frame to export: 'old_title' holds the plain tokens joined by spaces, 'title' the
# bigram-merged tokens joined by spaces, and 'rate' is stored as int8.
file_export = file_model.copy()
file_export['old_title'] = file_export.title.str.join(' ')
file_export['title'] = file_export.title.apply(lambda tokens: ' '.join(bigram[tokens]))
file_export['rate'] = file_export.rate.astype('int8')

In [22]:

# Write only the bigram-merged text and its label; the row index is not written.
file_export[['title', 'rate']].to_csv('cleaned_dataset.csv', index=False)

In [26]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [27]:
# Load the saved model and keep only its word vectors (.wv).
# NOTE(review): this path differs from the "word2vec.model" path the model is saved
# under earlier in this file - confirm the directory layout before re-running.
word_vectors = Word2Vec.load("../preprocessing_and_embeddings/word2vec.model").wv

INFO - 07:48:19: loading Word2Vec object from ../preprocessing_and_embeddings/word2vec.model
INFO - 07:48:19: loading wv recursively from ../preprocessing_and_embeddings/word2vec.model.wv.* with mmap=None
INFO - 07:48:19: loading vectors from ../preprocessing_and_embeddings/word2vec.model.wv.vectors.npy with mmap=None
INFO - 07:48:20: setting ignored attribute vectors_norm to None
INFO - 07:48:20: loading vocabulary recursively from ../preprocessing_and_embeddings/word2vec.model.vocabulary.* with mmap=None
INFO - 07:48:20: loading trainables recursively from ../preprocessing_and_embeddings/word2vec.model.trainables.* with mmap=None
INFO - 07:48:20: loading syn1neg from ../preprocessing_and_embeddings/word2vec.model.trainables.syn1neg.npy with mmap=None
INFO - 07:48:20: setting ignored attribute cum_table to None
INFO - 07:48:20: loaded ../preprocessing_and_embeddings/word2vec.model


In [28]:
# Two clusters over all word vectors. random_state is a fixed integer seed so that
# the clustering (and therefore which cluster gets index 0) is reproducible.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors)

In [29]:
# Ten words most similar to the centre of cluster 0.
# NOTE(review): presumably inspected by hand to decide which cluster is the positive one.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

INFO - 07:57:41: precomputing L2-norms of word weight vectors


[('profesjonalna_obsluga/', 0.9401209354400635),
 ('ladne_garnki', 0.9310925006866455),
 ('bezprolemowo', 0.926703929901123),
 ('nez_problemow', 0.9123505353927612),
 ('cudowna_wspolpraca', 0.9001386165618896),
 ('pelen_profesjonalim', 0.8995329141616821),
 ('ybko', 0.8930517435073853),
 ('bezproblemowow', 0.8812028169631958),
 ('przytepne', 0.8762412071228027),
 ('towar_zgodnyz', 0.8729431629180908)]

In [30]:

# Cluster 0 is treated as the positive cluster and cluster 1 as the negative one.
# NOTE(review): this labelling presumably follows from inspecting the nearest words
# of cluster 0 - re-check it whenever the clustering is re-fitted.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [31]:

# One row per vocabulary word together with its embedding vector.
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
# Index the KeyedVectors object directly; going through its `.wv` attribute only
# returns the same object and emits a deprecation warning.
words['vectors'] = words.words.apply(lambda word: word_vectors[word])
# Predict all cluster labels in one call instead of one KMeans.predict call per word.
words['cluster'] = model.predict(np.vstack(words.vectors.values))

  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
# Cluster 0 carries the sign +1, any other cluster the sign -1.
words['cluster_value'] = [1 if label == 0 else -1 for label in words.cluster]
# Distances of every word to all cluster centres, computed in one transform call
# rather than one call per row; the closeness score is the inverse of the
# distance to the nearest centre.
center_distances = model.transform(np.vstack(words.vectors.values))
words['closeness_score'] = 1 / center_distances.min(axis=1)
# Signed score: closeness weighted by the sign of the word's cluster.
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [33]:
# Preview the first ten words with their cluster and scores.
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,polecam,"[-0.01980895, 0.022030339, -0.06727692, -0.051...",0,1,1.086555,1.086555
1,nie,"[-0.038373433, -0.030603966, -0.019125922, 0.0...",1,-1,1.079863,-1.079863
2,pierwszy,"[-0.05512315, 0.04625909, -0.10541313, -0.0739...",0,1,0.981306,0.981306
3,i,"[-0.023885915, -0.036067966, -0.017669266, 0.0...",0,1,1.042999,1.042999
4,ostatni,"[-0.056395046, -0.019210044, -0.097046375, -0....",0,1,0.991501,0.991501
5,raz,"[0.0090312995, 0.039561126, 0.00088073657, 0.0...",0,1,0.981433,0.981433
6,!,"[0.0025381062, -0.056386854, -0.0658799, 0.041...",0,1,1.029283,1.029283
7,bardzo,"[0.005475564, 0.015712863, -0.109656475, -0.00...",0,1,1.087052,1.087052
8,dobra_komunikacja,"[0.014074041, -0.038561597, -0.048941635, 0.00...",0,1,1.040476,1.040476
9,sms,"[-0.075536475, 0.00052360346, -0.033483103, 0....",1,-1,1.00148,-1.00148


In [34]:
# Persist the word -> signed score mapping; the row index is not written.
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [37]:
# Load the cleaned sentences ('title') and their labels ('rate') written earlier in this file.
final_file = pd.read_csv('cleaned_dataset.csv')

In [39]:
# Read the words strictly as text. With pandas' default NA handling, vocabulary
# entries such as "nan", "null" or "n/a" are parsed as missing values and can then
# never be matched by a string lookup.
sentiment_map = pd.read_csv('sentiment_dictionary.csv', dtype={'words': str}, keep_default_na=False)
# Plain dict for fast word -> sentiment coefficient lookups.
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [40]:
# Independent copy used for the tf-idf weighting steps below.
file_weighting = final_file.copy()


In [41]:
# Sentences are tokenised by splitting on whitespace only; norm=None turns off
# the per-document normalisation of the tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.title)
# get_feature_names() is no longer available in recent scikit-learn releases;
# use its replacement when present and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.title)



In [42]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for the input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used to select the matching row of transformed_file,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a dict mapping each word stored in that row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Resolve the column positions to words in a separate array: the matrix's own
    # .col attribute is an integer index array and must not be overwritten with strings.
    feature_names = features.iloc[vector_coo.col].values
    return dict(zip(feature_names, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    turn the sentence of row x into the list of tfidf scores of its words,
    in the order in which the words appear in the sentence

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    tfidf_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [tfidf_by_word[f'{token}'] for token in x.title.split()]

In [43]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#row-wise apply over the whole frame; this step takes a few minutes to calculate (about 5 in the recorded run)

Wall time: 5min 13s


In [44]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score associated with a word in sentiment_dict;
    a word that is missing from the dictionary gets the score 0
    '''
    return sentiment_dict.get(word, 0)

In [45]:
# For every sentence, the list of sentiment scores of its whitespace-separated words.
replaced_closeness_scores = file_weighting.title.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

In [46]:
# Stack the four Series as rows, then transpose so that each becomes a column.
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title, file_weighting.rate]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
# Sentence score: dot product of the per-word sentiment coefficients and tf-idf scores.
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
# Predict 1 when the sentence score is strictly positive, otherwise 0.
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
# Recode the labels to match the predictions: 1 stays 1, every other value becomes 0.
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [47]:

# Display the scored sentences together with their labels and predictions.
replacement_df

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment,sentiment_rate,prediction
0,"[1.086554531826789, -1.0798634190937, 0.981306...","[1.8727707163903236, 6.599017534847987, 6.5110...",polecam nie pierwszy i nie ostatni raz !,1,10.917805,1
1,"[1.0870520332447249, 1.0404762505856682, -1.00...","[2.2050406906691054, 7.0789201195193305, 7.336...",bardzo dobra_komunikacja sms i telefoniczna za...,1,3.145160,1
2,"[1.086554531826789, 1.0431547828185102, -0.998...","[1.8727707163903236, 3.75718487435639, 2.29420...",polecam zakupy w tym_sklepie sa_dostepne czesc...,1,-24.934678,0
3,"[1.035737404856952, -0.9987550632429492, 1.042...","[3.3032703526520923, 2.2942005964846954, 6.727...",jestem w pelni zadowolona z przebiegu_transakcji,1,22.611122,1
4,"[1.0153237055080375, 0.9909470571066472, 1.075...","[4.795254417467228, 5.616755373457475, 5.51431...",transakcja_przebiegla blyskawicznie pelen_prof...,1,31.430042,1
...,...,...,...,...,...,...
645226,"[-1.0591985745872512, -1.0208472498029153, 1.0...","[3.636760022521373, 4.3033756885027366, 7.1035...",to juz moje_kolejne zakupy ogromny_wybor towar...,1,71.151348,1
645227,"[1.0622491900392836, 1.063087474971026, 1.0028...","[2.788952021927305, 6.35381282862313, 7.221968...",sklep fajny robie w nim duzo zakupow,1,32.773911,1
645228,"[0.9943928072952801, -0.9987550632429492, 0.99...","[3.0351213211703048, 2.2942005964846954, 4.652...",wszystko w porzadku polecam ! !,1,18.723207,1
645229,"[1.094910131985734, 1.043115323523781, 1.06957...","[11.432926241861038, 7.840673623436173, 12.991...",interesujaca_oferta czytelna kategoryzacja pro...,1,47.227990,1


In [48]:
# Labels and predictions compared below.
y_test = replacement_df.sentiment
predicted_classes = replacement_df.prediction

# Confusion matrix of labels against predictions.
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix')
display(conf_matrix)

# Accuracy, precision, recall and F1, kept together as a tuple in that order.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
test_scores = tuple(metric(y_test, predicted_classes) for metric in metric_functions)

print('\n \n Scores')
# One row per metric, a single 'scores' column.
scores = pd.DataFrame(
    data=list(test_scores),
    index=['accuracy', 'precision', 'recall', 'f1'],
    columns=['scores'],
)
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,9495,334
1,105334,530068



 
 Scores


Unnamed: 0,scores
accuracy,0.836232
precision,0.99937
recall,0.834225
f1,0.90936
