# 0. Getting ready

In [1]:
import math
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import WordNetLemmatizer
from autocorrect import Speller
import warnings
import multiprocessing as mp
warnings.filterwarnings('ignore')
from numpy.linalg import norm
import itertools
from numpy import dot

In [2]:
os.getcwd() + 'data\\'

'C:\\Users\\Vladimir\\DataspellProjects\\tweetsdata\\'

In [3]:
a = pd.read_csv('data/processedNegative.csv', header=None).T.assign(sentiment=-1)
b = pd.read_csv('data/processedNeutral.csv', header=None).T.assign(sentiment=0)
c = pd.read_csv('data/processedPositive.csv', header=None).T.assign(sentiment=1)

my_tweets = []
my_tweets = pd.concat([a, b, c], axis = 0, ignore_index=True)
my_tweets = shuffle(my_tweets)
my_tweets.columns = ['tweets', 'sentiment']
my_tweets.dropna(subset=['tweets'], inplace = True)
my_tweets

Unnamed: 0,tweets,sentiment
12,Errr dude.... They're gone unhappy Asked othe...,-1
1651,as petitioner wants.,0
1456,Daddy dearest,0
1110,unhappy they not,-1
2536,claims he has the masses. Who decides?,0
...,...,...
3169,hi cham,1
2359,5-judge constitution bench to sit in May to de...,0
3160,Thank you so much my friend smile xx,1
2577,Shades of in law change,0


# 1. Data preparation

## tokenization

In [4]:
def tokenizer(col, is_stopwords):
    tokens = nltk.word_tokenize(str(col).lower())
    # delete all punctuation marks and numbers
    tokens = [i for i in tokens if (i not in string.punctuation)]
    tokens = [i for i in tokens if i.isalpha()]

    # delete stop words
    if (not is_stopwords):
        stop_words = stopwords.words('english')
        tokens = [i for i in tokens if (i not in stop_words)]

    return tokens

In [5]:
tweets = my_tweets
tweets['tokens'] = tweets['tweets'].apply(lambda  x : tokenizer(x, False))
tweets['tokens_with_stopwords'] = tweets['tweets'].apply(lambda  x : tokenizer(x, True))

tweets = tweets[tweets['tokens'].map(lambda x : len(x)) > 0]
tweets = tweets[tweets['tokens_with_stopwords'].map(lambda x : len(x)) > 0]
tweets.reset_index(drop=True, inplace = True)
tweets

Unnamed: 0,tweets,sentiment,tokens,tokens_with_stopwords
0,Errr dude.... They're gone unhappy Asked othe...,-1,"[errr, dude, gone, unhappy, asked, league, mem...","[errr, dude, they, gone, unhappy, asked, other..."
1,as petitioner wants.,0,"[petitioner, wants]","[as, petitioner, wants]"
2,Daddy dearest,0,"[daddy, dearest]","[daddy, dearest]"
3,unhappy they not,-1,[unhappy],"[unhappy, they, not]"
4,claims he has the masses. Who decides?,0,"[claims, masses, decides]","[claims, he, has, the, masses, who, decides]"
...,...,...,...,...
3825,hi cham,1,"[hi, cham]","[hi, cham]"
3826,5-judge constitution bench to sit in May to de...,0,"[constitution, bench, sit, may, decide, pleas,...","[constitution, bench, to, sit, in, may, to, de..."
3827,Thank you so much my friend smile xx,1,"[thank, much, friend, smile, xx]","[thank, you, so, much, my, friend, smile, xx]"
3828,Shades of in law change,0,"[shades, law, change]","[shades, of, in, law, change]"


## stemming

In [6]:
stemmer = SnowballStemmer(language='english')
tweets['stemmed'] = tweets['tokens'].apply(lambda x : [stemmer.stem(word) for word in x])
tweets

Unnamed: 0,tweets,sentiment,tokens,tokens_with_stopwords,stemmed
0,Errr dude.... They're gone unhappy Asked othe...,-1,"[errr, dude, gone, unhappy, asked, league, mem...","[errr, dude, they, gone, unhappy, asked, other...","[errr, dude, gone, unhappi, ask, leagu, memeb,..."
1,as petitioner wants.,0,"[petitioner, wants]","[as, petitioner, wants]","[petition, want]"
2,Daddy dearest,0,"[daddy, dearest]","[daddy, dearest]","[daddi, dearest]"
3,unhappy they not,-1,[unhappy],"[unhappy, they, not]",[unhappi]
4,claims he has the masses. Who decides?,0,"[claims, masses, decides]","[claims, he, has, the, masses, who, decides]","[claim, mass, decid]"
...,...,...,...,...,...
3825,hi cham,1,"[hi, cham]","[hi, cham]","[hi, cham]"
3826,5-judge constitution bench to sit in May to de...,0,"[constitution, bench, sit, may, decide, pleas,...","[constitution, bench, to, sit, in, may, to, de...","[constitut, bench, sit, may, decid, plea, rela..."
3827,Thank you so much my friend smile xx,1,"[thank, much, friend, smile, xx]","[thank, you, so, much, my, friend, smile, xx]","[thank, much, friend, smile, xx]"
3828,Shades of in law change,0,"[shades, law, change]","[shades, of, in, law, change]","[shade, law, chang]"


## lemmatization

In [7]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
tweets['lemmatized'] = tweets['tokens'].apply(lambda x : [lemmatizer.lemmatize(word) for word in x])
tweets
# could be also used misspelling

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Vladimir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,tweets,sentiment,tokens,tokens_with_stopwords,stemmed,lemmatized
0,i took a full drink into my uber; i guess you ...,1,"[took, full, drink, uber, guess, could, say, i...","[i, took, a, full, drink, into, my, uber, i, g...","[took, full, drink, uber, guess, could, say, i...","[took, full, drink, uber, guess, could, say, i..."
1,I need cheering up,-1,"[need, cheering]","[i, need, cheering, up]","[need, cheer]","[need, cheering]"
2,imagine if he wins next time too unhappy unha...,-1,"[imagine, wins, next, time, unhappy, unhappy]","[imagine, if, he, wins, next, time, too, unhap...","[imagin, win, next, time, unhappi, unhappi]","[imagine, win, next, time, unhappy, unhappy]"
3,follow mee plss unhappy,-1,"[follow, mee, plss, unhappy]","[follow, mee, plss, unhappy]","[follow, mee, plss, unhappi]","[follow, mee, plss, unhappy]"
4,Education rankings throw up surprises.,0,"[education, rankings, throw, surprises]","[education, rankings, throw, up, surprises]","[educ, rank, throw, surpris]","[education, ranking, throw, surprise]"
...,...,...,...,...,...,...
3825,IF YOU ARE READING THIS I HOPE SOMETHING GREAT...,1,"[reading, hope, something, great, happens, smile]","[if, you, are, reading, this, i, hope, somethi...","[read, hope, someth, great, happen, smile]","[reading, hope, something, great, happens, smile]"
3826,What a sad life it must be,1,"[sad, life, must]","[what, a, sad, life, it, must, be]","[sad, life, must]","[sad, life, must]"
3827,Offline unhappy,-1,"[offline, unhappy]","[offline, unhappy]","[offlin, unhappi]","[offline, unhappy]"
3828,People who don%27t know me always think I am q...,1,"[people, know, always, think, quiet, people, k...","[people, who, don, know, me, always, think, i,...","[peopl, know, alway, think, quiet, peopl, know...","[people, know, always, think, quiet, people, k..."


## misspelling

In [7]:
spell = Speller(lang = 'en')
tweets['misspelling'] = tweets['tokens'].apply(lambda x : [stemmer.stem(spell(word)) for word in x])

In [8]:
tweets['lemma + misspell'] = tweets['tokens'].apply(lambda x : [lemmatizer.lemmatize(spell(word)) for word in x])

# 2. Vectorization

## Word exists

In [7]:
we = CountVectorizer(binary=True)
x_we = we.fit_transform(tweets["tokens"].astype('str'))

In [7]:
x_we

<3830x5910 sparse matrix of type '<class 'numpy.int64'>'
	with 19261 stored elements in Compressed Sparse Row format>

## Bag of words

In [9]:
cv = CountVectorizer()
x_bow = cv.fit_transform(tweets["tokens"].astype('str'))


## TF-IDF

In [12]:
tv = TfidfVectorizer()
x_tfidf = tv.fit_transform(tweets["tokens"].astype('str'))


## Word2Vec

In [160]:
mp.cpu_count()

8

In [38]:
modelW2V = Word2Vec(
                    window=3,
                    sample=6e-5,
                    alpha=0.03,
                    min_alpha=0.0007,
                    negative=10,
                    vector_size=300, # number of neurons in hidden layer of NN
                    min_count=5, # word will count if it occurs N times in all tweets
                    workers=8) # depends on number of CPU cores

In [39]:
modelW2V.build_vocab(tweets['tokens'], progress_per=10000)

In [40]:
modelW2V.train(tweets['tokens'], total_examples=modelW2V.corpus_count, epochs=50, report_delay=1)

(132634, 982300)

In [28]:
modelW2V.wv[1] # vector of values of hidden layer neurons for the word

array([-1.51315108e-01,  5.50881565e-01, -9.51377116e-03,  2.80250639e-01,
        3.60852838e-01,  4.00167471e-03,  3.06205750e-01,  6.20851517e-01,
        1.39004648e-01, -6.04955666e-03, -1.59224570e-01, -3.20364207e-01,
       -5.83596621e-03, -4.15915936e-01, -1.88780218e-01, -6.73910558e-01,
       -4.97146063e-02,  9.10909548e-02, -2.03169510e-01,  2.72166245e-02,
       -5.54300487e-01, -2.62648582e-01,  1.72311872e-01,  1.23393558e-01,
        2.35696346e-01, -1.03093155e-01, -3.67407799e-01,  1.17574632e-01,
       -1.00561447e-01, -5.41165173e-01, -6.18582591e-02, -9.27833617e-02,
        8.54666755e-02, -3.47621173e-01, -1.93278193e-01,  1.56919867e-01,
        2.09297523e-01, -3.45100760e-01,  1.04102165e-01,  2.42870435e-01,
       -5.73552512e-02,  5.03180623e-02,  1.32043034e-01, -5.34826294e-02,
       -2.02198192e-01,  1.78821161e-01,  2.45689422e-01, -3.21193129e-01,
       -1.63090557e-01,  1.01494126e-01,  2.91389897e-02, -2.05518380e-01,
       -3.38081837e-01,  

In [43]:
modelW2V.wv.most_similar(positive=["man"])

[('charge', 0.9996358156204224),
 ('place', 0.9996321201324463),
 ('work', 0.9996307492256165),
 ('social', 0.9996282458305359),
 ('put', 0.999627411365509),
 ('wants', 0.9996249079704285),
 ('today', 0.9996242523193359),
 ('governor', 0.9996238946914673),
 ('really', 0.9996238350868225),
 ('cm', 0.9996156692504883)]

# 3. Classification for sentiment analysis

In [None]:
# function to compute tweet vectors from word vectors
def text2vec(text, model_w2v):
    sent_vectors = []
    for word in text:
        try:
            sent_vectors.append(model_w2v.wv[word]) # appending all word vectors
        except KeyError:
            pass
    return np.sum(sent_vectors, axis=0) / len(sent_vectors) # normalization

In [103]:
# vectorisation methods
meth_dict = {"WE" : CountVectorizer(binary=True),
             "BoW" : CountVectorizer(),
             "TF-IDF" : TfidfVectorizer(),
             "Word2Vec" : modelW2V
             }

# classifiers
class_list = [
    RandomForestClassifier(n_estimators=1000, random_state=1),
    SVC(C = 0.5, kernel='linear', gamma='auto'), # these parameters were established using by GridSearch
    LogisticRegression(solver='liblinear')
]

In [104]:
scores = {}
for method in tweets.columns[2:]:
    for name, vectorizer in meth_dict.items():
        for classifier in class_list:
            if name == "Word2Vec":
                vectorizer.build_vocab(tweets[method], update=True)  # prepare the model vocabulary
                vectorizer.train(tweets[method], total_examples=vectorizer.corpus_count, epochs= 400) #vectorizer.epochs)
                # train word vectors
                X = np.stack(tweets[method].apply(lambda row: text2vec(row, vectorizer)))
            else:
                X = vectorizer.fit_transform(tweets[method].astype("str"))
            X_train, X_test, y_train, y_test = train_test_split(X, tweets['sentiment'], test_size = 0.2, stratify=tweets['sentiment'])
            classifier.fit(X_train, y_train)
        scores[round(classifier.score(X_test, y_test), 3)] = name, method, classifier.__class__.__name__
        print("Vectorizer: {}, Classifier: {}, Preprocessing method: {}, \nAccuracy: {}".format(
            name, classifier.__class__.__name__, method, round(classifier.score(X_test, y_test), 3)))


Vectorizer: WE, Classifier: LogisticRegression, Preprocessing method: tokens, 
Accuracy: 0.862
Vectorizer: BoW, Classifier: LogisticRegression, Preprocessing method: tokens, 
Accuracy: 0.858
Vectorizer: TF-IDF, Classifier: LogisticRegression, Preprocessing method: tokens, 
Accuracy: 0.851
Vectorizer: Word2Vec, Classifier: LogisticRegression, Preprocessing method: tokens, 
Accuracy: 0.834
Vectorizer: WE, Classifier: LogisticRegression, Preprocessing method: tokens_with_stopwords, 
Accuracy: 0.875
Vectorizer: BoW, Classifier: LogisticRegression, Preprocessing method: tokens_with_stopwords, 
Accuracy: 0.881
Vectorizer: TF-IDF, Classifier: LogisticRegression, Preprocessing method: tokens_with_stopwords, 
Accuracy: 0.849
Vectorizer: Word2Vec, Classifier: LogisticRegression, Preprocessing method: tokens_with_stopwords, 
Accuracy: 0.851
Vectorizer: WE, Classifier: LogisticRegression, Preprocessing method: stemmed, 
Accuracy: 0.871
Vectorizer: BoW, Classifier: LogisticRegression, Preprocessing

In [115]:
m = max(scores.keys())
print("The best result:\nVectorizer: {}, Classifier: {}, Preprocessing method: {}, \nAccuracy: {}".format(
    scores[m][0], scores[m][2], scores[m][1], m))

The best result:
Vectorizer: WE, Classifier: lemmatized, Preprocessing method: LogisticRegression, 
Accuracy: 0.888


## GridSearch model

GridSearchCV – это очень мощный инструмент для автоматического подбирания параметров для моделей машинного обучения. GridSearchCV находит наилучшие параметры, путем обычного перебора: он создает модель для каждой возможной комбинации параметров. Важно отметить, что такой подход может быть весьма времязатратным

https://vc.ru/ml/147132-kak-avtomaticheski-podobrat-parametry-dlya-modeli-mashinnogo-obucheniya-ispolzuem-gridsearchcv


In [None]:
# C-Support Vector Classification

cv = CountVectorizer()
X = cv.fit_transform(tweets['stemmed'].astype('str'))

param_grid = {'C' : [0.5, 0.7, 0.9, 1],
              'gamma' : [10, 5, 4, 3, 2, 1.5],
            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 1, n_jobs=10, cv=10)
grid.fit(X, tweets['sentiment'])


In [47]:
grid.best_params_

{'C': 0.5, 'gamma': 10, 'kernel': 'linear'}

In [49]:
grid.best_score_

0.8741514360313316

# 4. Cosine similarity
Cosine similarity is a metric, helpful in determining, how similar the data objects are irrespective of their size. We can measure the similarity between two sentences in Python using Cosine Similarity. In cosine similarity, data objects in a dataset are treated as a vector. The formula to find the cosine similarity between two vectors is –

Cos(x, y) = x . y / ||x|| * ||y||

https://www.geeksforgeeks.org/cosine-similarity/

In [51]:
# available preprocessing methods
for method in tweets.columns[2:]:
    print(method + '\n')
    tweets[method].values

tokens

tokens_with_stopwords

stemmed



In [15]:
# function to find cosine distance
def get_cosine(a, b):
    return dot(a, b)/(norm(a)*norm(b))

In [16]:
def find_dist(sent): #sentence

    # to avoid duplicates
    sent = sent.apply(lambda x: ' '.join(x))
    sent.drop_duplicates(inplace=True)
    sent.reset_index(inplace=True, drop=True)
    sent = sent.apply(lambda x: x.split(' '))

    # to train model
    m = Word2Vec(sent,  vector_size=300,
                        min_count=1 ,
                        workers=8)
    m.build_vocab(tweets['tokens'], progress_per=10000)
    m.train(tweets['tokens'], total_examples=m.corpus_count, epochs=50, report_delay=1)

    # to create tweet vector from word vectors
    sent_vectors = []
    for word in sent:
        sent_vectors.append(np.mean([m.wv[j] for j in word], axis=0))

    # to count vector distances
    distances = {}
    r = 0
    for i in range(0, len(sent_vectors)):
       for j in range(i, len(sent_vectors)):
    #for i in range(0, 10):
        #for j in range(i, 10):
            if i != j:
                r += 1
                distances[r] = (get_cosine(sent_vectors[i], sent_vectors[j]), i, j)

    # to sort distances
    s = dict(sorted(distances.items(), key=lambda x: x[1][0], reverse=True))
    # to return top 10 similar tweets
    return dict(itertools.islice(s.items(), 10))



In [101]:
# top 10 similar distances for tokens

res = find_dist(tweets['tokens'])
res

{815536: (1.0000001, 254, 1847),
 89632: (0.9934575, 27, 73),
 1258702: (0.9933491, 402, 643),
 3174197: (0.9932755, 1151, 3192),
 89593: (0.9917485, 27, 34),
 1710500: (0.9910057, 560, 2220),
 113302: (0.99075913, 34, 643),
 198355: (0.98992485, 60, 325),
 1066621: (0.9895986, 337, 1027),
 2895065: (0.98914653, 1027, 2006)}

In [102]:
for i, j, k in res.values():
    print(i, '|' + tweets['tweets'][j], '\t|' + tweets['tweets'][k])

1.0000001 |this is jimin to yoongi unhappy   	|i want to hang out with them unhappy 
0.9934575 |BJP urges Governor Vidyasagar Rao to take a decision on basis of numbers as well as credibility.  	|to WIN draw for free download of for Celebrate with us. Developed by for
0.9933491 |52 injured as Jabalpur-Delhi train derails near half train carries on after delay.  	|Lunchtime with Flinthook! This game has so much personality :D
0.9932755 | but I liked seeing her posts and love her wri  	| they're like 90 calories per piece unhappy  
0.9917485 |BJP urges Governor Vidyasagar Rao to take a decision on basis of numbers as well as credibility.  	|not clear right now. Speaker to decide....
0.9910057 | House adjourned till 3pm. 	|How theatre festival got muffled. 
0.99075913 |not clear right now. Speaker to decide.... 	|Lunchtime with Flinthook! This game has so much personality :D
0.98992485 |Thank you crew happy 	|How self-belief has kept him on top of the game. 
0.9895986 |Defence Partnership

In [18]:
# top 10 similar distances for stemmed
res = find_dist(tweets['stemmed'])
for i, j, k in res.values():
    print(i, '|' + tweets['tweets'][j], '\t|' + tweets['tweets'][k])

0.99999994 |.I can't believe this hasn't been fixed yet 	|Supreme Court quashes criminal complaint against cricketer for allegedly depicting himself as on magazine cover.
0.9971929 |Thanks for the recent follow Happy to connect happy  have a great Thursday. Want it 	| contact. 
0.99715406 |Reservoirs to financial deal 	|Thanks for being top engaged community members this week happy   Want this
0.9971358 |Death by currency 	|Such a lucky slave i wish this was me unhappy 
0.9967134 |000 in villages are taught to Google designs 	|One year ago today unhappy  
0.99668497 |Hi! We tried to call your number but got no response unhappy  Please share another suitable time and an alternate number.. cont1 	|000 in villages are taught to Google designs
0.9963405 |On our way home happy  happy si Noah sa skyranch 	|From soup and sandwiches to slow cooked pork
0.9959435 |Thanks for the recent follow Happy to connect happy  have a great Thursday. Want it 	|Merry Christmas everyone happy
0.99588853 |i k