In [1]:
# Importing Packages 

import numpy as np
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [73]:
# Load the labelled tweets into a DataFrame.
# The file is decoded as latin1 rather than pandas' default encoding.
data = pd.read_csv(
    filepath_or_buffer="Ola_Category_3000.csv",
    encoding='latin1',
)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Tweets,Category
0,thank you for sending us your crns and showing...,1
1,\r\nworry not autoconnect lure mode in all ola...,0
2,kabaliday,0
3,were in do let us know how we can help,0
4,done,0


In [74]:
# Cleaning Text
#
# Builds `df_clean` from `data` with two extra columns:
#   clean  - the tweet text coerced to str
#   tokens - list of word tokens produced by RegexpTokenizer(r'\w+')

from nltk.tokenize import RegexpTokenizer

# Work on a copy: a plain `df_clean = data` makes both names refer to the
# same DataFrame, so adding columns here would silently modify `data` too.
df_clean = data.copy()

# r'\w+' keeps runs of word characters and discards everything between them.
tokenizer = RegexpTokenizer(r'\w+')

# Coerce to str so the tokenizer never receives a non-string value.
# NOTE(review): astype('str') turns a missing tweet into the literal text
# "nan", which tokenizes to ["nan"] - confirm whether the CSV has empty rows.
df_clean['clean'] = df_clean['Tweets'].astype('str')
df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)

df_clean.head()

Unnamed: 0,Tweets,Category,clean,tokens
0,thank you for sending us your crns and showing...,1,thank you for sending us your crns and showing...,"[thank, you, for, sending, us, your, crns, and..."
1,\r\nworry not autoconnect lure mode in all ola...,0,\r\nworry not autoconnect lure mode in all ola...,"[worry, not, autoconnect, lure, mode, in, all,..."
2,kabaliday,0,kabaliday,[kabaliday]
3,were in do let us know how we can help,0,were in do let us know how we can help,"[were, in, do, let, us, know, how, we, can, help]"
4,done,0,done,[done]


In [4]:
# Importing Word2Vec
#
# Reads the GoogleNews vectors from a binary word2vec-format file and binds
# them to `w2v_model`.
# NOTE(review): the next cell rebinds `w2v_model` to a freshly trained
# Word2Vec model, so these vectors are replaced before any visible use -
# confirm whether this load is still needed.

import gensim

w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [75]:
# Train a Word2Vec model on the tokenized tweets in df_clean["tokens"].
# NOTE(review): no later cell in this notebook reads `w2v_model`; the
# classifiers below are fed TF-IDF features - confirm the model is still needed.

import multiprocessing

import gensim
from gensim.models import Word2Vec

# WORD2VEC()
cores = multiprocessing.cpu_count()  # number of CPU cores, used to size the worker pool

# gensim 4.0 renamed the embedding-dimension argument from `size` to
# `vector_size`; pick the keyword that matches the installed version so the
# cell runs on both.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

w2v_model = Word2Vec(min_count=20,      # ignore words seen fewer than 20 times
                     window=2,          # context window on each side of the target word
                     sample=6e-5,       # down-sampling threshold for frequent words
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate decays linearly to this value
                     negative=20,       # number of negative samples per positive example
                     # cores-1 would be 0 on a single-core machine; keep at least one worker
                     workers=max(1, cores - 1),
                     **{dim_kwarg: 300})

# BUILD_VOCAB()

w2v_model.build_vocab(df_clean["tokens"], progress_per=1000)


# TRAIN()

# NOTE(review): epochs=10000 is kept as in the original run; it dominates the
# runtime of this notebook - confirm it is intentional.
w2v_model.train(df_clean["tokens"], total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)

(44404564, 464410000)

In [76]:
# Inputs are the per-tweet token lists; targets are the Category labels.
X = np.array(df_clean["tokens"])
y = data['Category'].values

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [77]:
# Creating Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer


def _identity_analyzer(tokens):
    """Return the document unchanged; each document is already a token list."""
    return tokens


# The callable analyzer hands the pre-tokenized lists straight through;
# min_df=10 drops terms that appear in fewer than 10 documents.
vectorizer = TfidfVectorizer(analyzer=_identity_analyzer, min_df=10)

In [78]:
# Learn the vocabulary and IDF weights from the training documents only,
# then encode them.
# NOTE: X_train / X_test are rebound from token lists to TF-IDF matrices, so
# re-running this cell on its own would feed the matrices back into the
# vectorizer - re-run the train/test split cell first.
X_train = vectorizer.fit_transform(raw_documents=X_train)

# Encode the test documents with the vocabulary fitted above (no refitting).
X_test = vectorizer.transform(raw_documents=X_test)

In [79]:
# Getting Number of Features from the Vectorizer
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); prefer the new method and fall back
# to the old one so the cell runs on either version.
if hasattr(vectorizer, "get_feature_names_out"):
    # The new method returns an ndarray; convert so `feature_names` stays a list.
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))

Number of features: 536


In [80]:
# 5-fold cross-validation of a default LogisticRegression on the training split.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
)

# Report the average of the per-fold scores.
print("Mean Cross-Validation accuracy: {:.2f}".format(scores.mean()))

Mean Cross-Validation accuracy: 0.79


In [81]:
# Logistic Regression: fit on the training split, then report the score on
# both splits.
logreg = LogisticRegression().fit(X_train, y_train)

print("Training set score: {:.3f}".format(logreg.score(X=X_train, y=y_train)))
print("Testing set score: {:.3f}".format(logreg.score(X=X_test, y=y_test)))

Training set score: 0.857
Testing set score: 0.819


In [82]:
# Logistic Regression predictions on the held-out split, summarised as a
# confusion matrix (rows = true class, columns = predicted class).
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=pred_logreg)

print("Confusion Matrix: \n{}".format(confusion))

Confusion Matrix: 
[[ 62  95]
 [ 13 426]]


In [83]:
# Multinomial Naive Bayes: fit on the training split, then report the score
# on both splits.
nb = MultinomialNB()
nb.fit(X_train, y_train)

for template, features, targets in (
    ("Training set score: {:.3f}", X_train, y_train),
    ("Testing set score: {:.3f}", X_test, y_test),
):
    print(template.format(nb.score(features, targets)))

Training set score: 0.793
Testing set score: 0.755


In [84]:
# Naive Bayes predictions on the held-out split and the resulting confusion
# matrix (rows = true class, columns = predicted class).
pred_nb = nb.predict(X=X_test)
confusion = confusion_matrix(y_test, y_pred=pred_nb)
print("Confusion Matrix: \n{}".format(confusion))

Confusion Matrix: 
[[ 21 136]
 [ 10 429]]


In [85]:
# Fitting Train Data with Random Forest Algorithm
#
# 100 bootstrapped trees, each split considering sqrt(n_features) candidates.
# random_state is fixed (same seed as the train/test split) so that the
# reported scores and the confusion matrix are reproducible across runs;
# without it every execution builds a different forest.
rf = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=0,
)
rf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(rf.score(X_train, y_train)))
print("Testing set score: {:.3f}".format(rf.score(X_test, y_test)))

Training set score: 0.996
Testing set score: 0.834


In [86]:
# Random Forest predictions on the held-out split and the resulting confusion
# matrix (rows = true class, columns = predicted class).
pred_rf = rf.predict(X_test)
confusion = confusion_matrix(y_pred=pred_rf, y_true=y_test)
print("Confusion Matrix: \n{}".format(confusion))

Confusion Matrix: 
[[ 83  74]
 [ 25 414]]
