In [2]:
# Importing Packages 

import numpy as np
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [3]:
# Importing Text

# Read the labelled tweets CSV from the working directory, decoding it as latin1.
csv_path = "Uber_Category_500.csv"
data = pd.read_csv(csv_path, encoding='latin1')

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Tweets,Category
0,please be informed that the uber credits that ...,1
1,hey babu we would like to take a closer look a...,1
2,hey roshan were grateful to have such uberstar...,1
3,hey rakesh could you please share the date and...,1
4,hey ananth we understand this can be upsetting...,0


In [4]:
# Cleaning Text

from nltk.tokenize import RegexpTokenizer

# Work on a copy: a plain `df_clean = data` would only alias the raw frame,
# so the columns added below would silently be added to `data` as well.
df_clean = data.copy()

# Cast every tweet to str so the tokenizer always receives text
# (non-string cells are converted to their string form).
df_clean['clean'] = df_clean['Tweets'].astype('str')

# Split each tweet into runs of word characters (\w+); punctuation and
# whitespace act as separators and are dropped.
tokenizer = RegexpTokenizer(r'\w+')
df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)

# Preview the frame with the new `clean` and `tokens` columns.
df_clean.head()

Unnamed: 0,Tweets,Category,clean,tokens
0,please be informed that the uber credits that ...,1,please be informed that the uber credits that ...,"[please, be, informed, that, the, uber, credit..."
1,hey babu we would like to take a closer look a...,1,hey babu we would like to take a closer look a...,"[hey, babu, we, would, like, to, take, a, clos..."
2,hey roshan were grateful to have such uberstar...,1,hey roshan were grateful to have such uberstar...,"[hey, roshan, were, grateful, to, have, such, ..."
3,hey rakesh could you please share the date and...,1,hey rakesh could you please share the date and...,"[hey, rakesh, could, you, please, share, the, ..."
4,hey ananth we understand this can be upsetting...,0,hey ananth we understand this can be upsetting...,"[hey, ananth, we, understand, this, can, be, u..."


In [4]:
# Importing Word2Vec

import gensim

# Load word vectors stored in the binary word2vec format from the working directory.
# NOTE(review): `w2v_model` is re-assigned by the Word2Vec(...) training cell
# that follows, so these vectors appear to be unused afterwards - confirm
# whether this load is still needed.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [5]:
import multiprocessing
from gensim.models import Word2Vec

# WORD2VEC()
# Leave one core free for the rest of the system, but never ask for fewer than
# one worker thread: cpu_count() can be 1, and `cores - 1` would then be 0.
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context window: 2 words on each side
                     size=300,           # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate decays linearly to this value
                     negative=20,        # number of negative samples per positive
                     workers=max(1, cores - 1))

#BUILD_VOCAB()
# Scan the token lists once to build the vocabulary table.
w2v_model.build_vocab(df_clean["tokens"], progress_per=1000)

#TRAIN()
# Train on the same token lists; the cell's displayed value is train()'s return value.
w2v_model.train(df_clean["tokens"], total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)

(107022823, 941840000)

In [6]:
# Words similar to "thanks".
# Printed explicitly: a notebook cell only displays its last expression, so a
# bare call here would compute the neighbours and throw the result away.
print(w2v_model.wv.most_similar(positive=["thanks"]))

# Words similar to "please" (shown as the cell's output value).
w2v_model.wv.most_similar(positive=["please"])

[('via', 0.3977193236351013),
 ('help', 0.3931013345718384),
 ('section', 0.3802177309989929),
 ('direct', 0.347443163394928),
 ('app', 0.3397883176803589),
 ('message', 0.3341812193393707),
 ('your', 0.3321651518344879),
 ('us', 0.32056373357772827),
 ('number', 0.28352048993110657),
 ('email', 0.2777360677719116)]

In [5]:
# Model inputs: the per-tweet token lists are the features (X) and the
# Category column holds the labels (y).
X = np.array(df_clean["tokens"])
y = data['Category'].values

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Creating Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer


def _passthrough(tokens):
    """Return the document unchanged, so it is used as-is as the list of terms."""
    return tokens


# min_df=10 drops terms that appear in fewer than 10 documents at fit time.
vectorizer = TfidfVectorizer(analyzer=_passthrough, min_df=10)

In [9]:
# Fit the vectorizer (vocabulary and IDF weights) on the training split only,
# then encode the training documents with it.
# NOTE(review): X_train / X_test are rebound from token lists to TF-IDF
# matrices here, so this cell should not be re-run without first re-running
# the train/test split cell.

X_train = vectorizer.fit_transform(X_train)

# Encode the held-out split with the already-fitted vectorizer (no refitting).

X_test = vectorizer.transform(X_test)

In [10]:
# Getting Number of Features from the Vectorizer

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed the older
# get_feature_names(); use whichever the installed version provides so the
# cell runs on both. The result is kept as a plain list in either case.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))

Number of features: 391


In [10]:
# 5-fold cross-validation of a default LogisticRegression on the training split.
# NOTE(review): X_train was vectorized with statistics fitted on the whole
# training split, so the folds are not fully independent of each other.
cv_model = LogisticRegression()
scores = cross_val_score(cv_model, X_train, y_train, cv=5)

mean_accuracy = np.mean(scores)
print("Mean Cross-Validation accuracy: {:.2f}".format(mean_accuracy))

Mean Cross-Validation accuracy: 0.84


In [12]:
# Train a default LogisticRegression on the vectorized training split
# (fit() returns the estimator itself, so it can be chained).
logreg = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the data the model was trained on and on the held-out split.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)

print("Training set score: {:.3f}".format(train_accuracy))
print("Testing set score: {:.3f}".format(test_accuracy))

Training set score: 0.931
Testing set score: 0.927


In [13]:
# Predict labels for the held-out split and tabulate them against the true
# labels (rows: true class, columns: predicted class).
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_logreg)

print("Confusion Matrix: \n{}".format(confusion))

Confusion Matrix: 
[[ 45  42]
 [  2 511]]
