In [1]:
# Importing Packages 

import numpy as np
import pandas as pd
import sklearn 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Importing Text

data = pd.read_csv("Uber_Category_3000.csv",encoding='latin1')
data.head()

Unnamed: 0,Tweets,Category
0,hi we understand this can be upsetting driver ...,1
1,hey savani weve fixed this for you and process...,1
2,hey sanju for us to assist you better kindly h...,1
3,we understand your concern nikhil we have made...,0
4,hey vinay were sorry to hear about the trouble...,1


In [3]:
# Cleaning Text

df_clean = data
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
df_clean['clean'] = df_clean['Tweets'].astype('str') 
df_clean.dtypes

df_clean["tokens"] = df_clean["clean"].apply(tokenizer.tokenize)

df_clean.head()

Unnamed: 0,Tweets,Category,clean,tokens
0,hi we understand this can be upsetting driver ...,1,hi we understand this can be upsetting driver ...,"[hi, we, understand, this, can, be, upsetting,..."
1,hey savani weve fixed this for you and process...,1,hey savani weve fixed this for you and process...,"[hey, savani, weve, fixed, this, for, you, and..."
2,hey sanju for us to assist you better kindly h...,1,hey sanju for us to assist you better kindly h...,"[hey, sanju, for, us, to, assist, you, better,..."
3,we understand your concern nikhil we have made...,0,we understand your concern nikhil we have made...,"[we, understand, your, concern, nikhil, we, ha..."
4,hey vinay were sorry to hear about the trouble...,1,hey vinay were sorry to hear about the trouble...,"[hey, vinay, were, sorry, to, hear, about, the..."


In [4]:
# Importing Word2Vec

import gensim
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
import multiprocessing
from gensim.models import Word2Vec

# WORD2VEC()
cores = multiprocessing.cpu_count() # Count the number of cores in a computer, important for a parameter of the model
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

#BUILD_VOCAB()

w2v_model.build_vocab(df_clean["tokens"], progress_per=1000)


#TRAIN()

w2v_model.train(df_clean["tokens"], total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)

(107030736, 941840000)

In [6]:
#words similar to thanks
w2v_model.wv.most_similar(positive=["thanks"])

#Words similar to please
w2v_model.wv.most_similar(positive=["please"])

[('via', 0.40942710638046265),
 ('help', 0.3877283036708832),
 ('section', 0.3641948699951172),
 ('app', 0.35763347148895264),
 ('message', 0.3463286757469177),
 ('your', 0.33568429946899414),
 ('direct', 0.32566604018211365),
 ('us', 0.3011002540588379),
 ('number', 0.2969534397125244),
 ('address', 0.27990180253982544)]

In [7]:
#First defining the X (input), and the y (output)

y = data['Category'].values
X = np.array(df_clean["tokens"])

#And here is the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [8]:
# Creating Vectorizer 

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)

In [9]:
# Transforming & Fitting Vectorizer to Train Data

X_train = vectorizer.fit_transform(X_train)

# Transforming Vectorizer to Test Data

X_test = vectorizer.transform(X_test)

In [10]:
# Getting Number of Features from the Vectorizer 
 
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))

Number of features: 391


In [11]:
# Calculating the Mean Cross-Validation Accuracy

scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=5)
print("Mean Cross-Validation accuracy: {:.2f}".format(np.mean(scores)))

Mean Cross-Validation accuracy: 0.95


In [54]:
rf = RandomForestClassifier(n_estimators=100, bootstrap= True, max_features = 'sqrt')
rf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(rf.score(X_train, y_train)))
print("Testing set score: {:.3f}".format(rf.score(X_test, y_test)))


Training set score: 1.000
Testing set score: 0.965


In [14]:
pred_rf = rf.predict(X_test)
confusion = confusion_matrix(y_test, pred_rf)
print("Confusion Matrix: \n{}".format(confusion))

Confusion Matrix: 
[[ 67  20]
 [  1 512]]
