In [47]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [48]:
### import the libraries related to NLP

import nltk
import re
import string

### sample customer reviews data

In [60]:
review_data = pd.read_csv("main_customer_reviews.csv", encoding='cp1252')

In [61]:
review_data.head()

Unnamed: 0,customerID,experience_with_device,user_interface,overall_experience,label
0,1001,good experience and I liked it.,8,8,positive
1,1002,enjoyed it a lot. No isses with the device,9,8,positive
2,1003,easy to monitor the results,9,10,positive
3,1004,not satisfied,3,5,negative
4,1004,hard to control,3,6,negative


In [62]:
review_data["experience_with_device"]

0                       good experience and I liked it.
1            enjoyed it a lot. No isses with the device
2                           easy to monitor the results
3                                         not satisfied
4                                       hard to control
                            ...                        
86    This product is really amazing. And I'm buying...
87    I have been using this product for quite a whi...
88    Its quite complicated to use and takes a lot o...
89    I didn’t expect the product to be this accurat...
90    I was skeptical about buying this product but ...
Name: experience_with_device, Length: 91, dtype: object

### lets work on the raw text data

In [63]:
stopwords = nltk.corpus.stopwords.words("english")

In [64]:
ps= nltk.PorterStemmer()

## the above method performs all cleaning operations

In [65]:

def clean_text(text):
    text = "".join(word for word in text if word not in string.punctuation)
    tokens = re.split("\W+", text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [66]:
review_data["cleaned_text"]= review_data["experience_with_device"].apply(lambda x: clean_text(x))

In [67]:
review_data.head()

Unnamed: 0,customerID,experience_with_device,user_interface,overall_experience,label,cleaned_text
0,1001,good experience and I liked it.,8,8,positive,"[good, experi, I, like]"
1,1002,enjoyed it a lot. No isses with the device,9,8,positive,"[enjoy, lot, No, iss, devic]"
2,1003,easy to monitor the results,9,10,positive,"[easi, monitor, result]"
3,1004,not satisfied,3,5,negative,[satisfi]
4,1004,hard to control,3,6,negative,"[hard, control]"


# tasks to do

### 1. Feature extraction and add new features

 ### 2. perform lemmatization by changing the clean_text method or implement a new method with lemmatization

#### before vectorizing, lets split the data into train, test data


In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(review_data[["experience_with_device","user_interface","overall_experience"]], review_data["label"] , test_size=0.2)

# vectorization

In [69]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [70]:
## count vectorization
cv_model = CountVectorizer(analyzer=clean_text)
cv_fit_model = cv_model.fit(X_train["experience_with_device"])
count_train = cv_fit_model.transform(X_train["experience_with_device"])
count_test = cv_fit_model.transform(X_test["experience_with_device"])


In [71]:
## in the same way, build tf-idf vectorizer
tfidf_model= TfidfVectorizer(analyzer=clean_text)
tfidf_fit_model = tfidf_model.fit(X_train["experience_with_device"])
tfidf_train = tfidf_fit_model.transform(X_train["experience_with_device"])
tfidf_test = tfidf_fit_model.transform(X_test["experience_with_device"])

In [72]:
### now combine the vectorized data with the existing features for the train and test data

X_train_vect = pd.concat([X_train[["user_interface","overall_experience"]].reset_index(drop=True),pd.DataFrame(tfidf_train.toarray())], axis=1 )
X_test_vect = pd.concat([X_test[["user_interface","overall_experience"]].reset_index(drop=True),pd.DataFrame(tfidf_test.toarray())], axis=1 )

In [74]:
X_train_vect.head()

Unnamed: 0,user_interface,overall_experience,0,1,2,3,4,5,6,7,...,175,176,177,178,179,180,181,182,183,184
0,10,10,0.0,0.0,0.0,0.0,0.537911,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,5,0.0,0.0,0.295423,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9,8,0.0,0.0,0.0,0.0,0.0,0.0,0.503253,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,7,0.470235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72353
4,7,7,0.323266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. perform grid search CV for the hyperparameter tuning!

#### 3.1 count vectorization

In [75]:
### It builds models with various values to the hyper parameters as input



from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv_model = CountVectorizer(analyzer=clean_text)
cv_model_vect = cv_model.fit_transform(review_data["experience_with_device"])


In [76]:
cv_vect_df = pd.DataFrame(cv_model_vect.toarray())

In [77]:
cv_vect_df.columns =cv_model.get_feature_names()

In [78]:
### now combine the features with the vector dataframe

cv_dataframe = pd.concat([review_data[["user_interface","overall_experience"]].reset_index(drop=True), cv_vect_df], axis=1)

In [79]:
cv_dataframe.head(2)

Unnamed: 0,user_interface,overall_experience,Unnamed: 3,10,I,Im,It,NO,No,abl,...,wast,way,weak,week,well,wish,work,worst,worth,wrong
0,8,8,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,8,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
from sklearn.model_selection import train_test_split

X_traincv, X_testcv, Y_traincv, Y_testcv = train_test_split(cv_dataframe,review_data["label"], test_size = 0.2)


#### 3.2 Random forest classifier

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_cv_model = RandomForestClassifier()
params = {"n_estimators":[10,30,50,100,150], "max_depth":[None, 20, 60, 120, 200]}
Grid_cv_model = GridSearchCV(rf_cv_model, params, cv= 5, n_jobs=-1)

Grid_cv_model_fit = Grid_cv_model.fit(X_traincv, Y_traincv)

In [82]:
res1 = pd.DataFrame(Grid_cv_model_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

In [83]:
res1.to_csv('gridsearch_count_results.csv',index=False)

In [84]:
Y_pred = Grid_cv_model_fit.predict(X_testcv)

#### 3.3 Gradient Boosting 

In [85]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

GB_cv_model = GradientBoostingClassifier()
params = {"n_estimators":[10,30,50,100,150], "max_depth":[None, 20, 60, 120, 200]}
Grid_cv_model = GridSearchCV(GB_cv_model, params, cv= 5, n_jobs=-1)

Grid_cv_model_fit = Grid_cv_model.fit(X_traincv, Y_traincv)

In [86]:
res1 = pd.DataFrame(Grid_cv_model_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

In [87]:
res1.to_csv('GB_gridsearch_count_results.csv',index=False)

#### 3.2  TF-IDF vectorization

In [88]:

from sklearn.feature_extraction.text import  TfidfVectorizer

tfidf_model = CountVectorizer(analyzer=clean_text)
tfidf_model_vect = tfidf_model.fit_transform(review_data["experience_with_device"])

In [89]:
tfidf_vect_df = pd.DataFrame(tfidf_model_vect.toarray())

In [90]:
tfidf_vect_df.columns =tfidf_model.get_feature_names()

In [91]:
### now combine the features with the vector dataframe

tfidf_dataframe = pd.concat([review_data[["user_interface","overall_experience"]].reset_index(drop=True), tfidf_vect_df], axis=1)

In [92]:
tfidf_dataframe.head(2)

Unnamed: 0,user_interface,overall_experience,Unnamed: 3,10,I,Im,It,NO,No,abl,...,wast,way,weak,week,well,wish,work,worst,worth,wrong
0,8,8,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,8,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
from sklearn.model_selection import train_test_split

X_traincv, X_testcv, Y_traincv, Y_testcv = train_test_split(tfidf_dataframe,review_data["label"], test_size = 0.2)


#### Random forest 

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_tfidf_model = RandomForestClassifier()
params = {"n_estimators":[10,30,50,100,150], "max_depth":[None, 20, 60, 120, 200]}
Grid_tfidf_model = GridSearchCV(rf_tfidf_model, params, cv= 5, n_jobs=-1)

Grid_tfidf_model_fit = Grid_tfidf_model.fit(X_traincv, Y_traincv)

In [95]:
res2 = pd.DataFrame(Grid_tfidf_model_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

In [96]:
res2.to_csv('gridsearch_tfidf_results.csv',index=False)

#### Gradient Boosting

In [97]:

GB_tfidf_model = GradientBoostingClassifier()
params = {"n_estimators":[10,30,50,100,150], "max_depth":[None, 20, 60, 120, 200]}
Grid_tfidf_model = GridSearchCV(GB_tfidf_model, params, cv= 5, n_jobs=-1)

Grid_tfidf_model_fit = Grid_tfidf_model.fit(X_traincv, Y_traincv)

In [98]:
res2 = pd.DataFrame(Grid_tfidf_model_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

In [99]:
res2.to_csv('GB_gridsearch_tfidf_results.csv',index=False)

## The Final Random Forest Model 

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

In [101]:
import time

In [108]:
rf_model = RandomForestClassifier(n_estimators=30, max_depth=None)
fit_time = time.time()
rf_model.fit(X_train_vect, Y_train)
fit_end_time= time.time()
total = fit_end_time - fit_end_time

pred_time = time.time()
y_pred = rf_model.predict(X_test_vect)
end_time = time.time()
pred_total = end_time - pred_time

precision,recall,fscore,support = precision_recall_fscore_support(Y_test, y_pred, pos_label="negative", average ="binary")

print("fit-time {}, pred-time {}, precision {}, recall {}".format(round(total,3), round(pred_total,3), round(precision,3), round(recall,2)))

fit-time 0.0, pred-time 0.007, precision 1.0, recall 0.89


## The Final Gradiant Boosting

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

In [110]:
grad_model = GradientBoostingClassifier(max_depth= None, n_estimators=10 )
fit_time = time.time()
grad_model.fit(X_train_vect, Y_train)
fit_end_time= time.time()
total = fit_end_time - fit_end_time

pred_time = time.time()
y_pred = grad_model.predict(X_test_vect)
end_time = time.time()
pred_total = end_time - pred_time

precision,recall,fscore,support = precision_recall_fscore_support(Y_test, y_pred, pos_label="negative", average ="binary")

print("fit-time {}, pred-time {}, precision {}, recall {}".format(round(total,3), round(pred_total,3), round(precision,3), round(recall,2)))

fit-time 0.0, pred-time 0.003, precision 1.0, recall 0.67


## GridSearch CV for the hyper parameter tuning

In [49]:
from sklearn.model_selection import GridSearchCV

In [52]:
params = {'n_estimators':[10,30,50,100], 'max_depth':[None,10,50,70,100]}
rf_model_2 = RandomForestClassifier()
grid_rf_model = GridSearchCV(rf_model_2,params,cv=5, n_jobs=-1)

grid_rf_fit = grid_rf_model.fit(X_train_vect, Y_train)


In [57]:
pd.DataFrame(grid_rf_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.029426,0.001356,0.004928,0.004764,,10,"{'max_depth': None, 'n_estimators': 10}",0.9375,0.8125,0.9375,1.0,0.933333,0.924167,0.06109,1
16,0.030662,0.002118,0.004399,0.001427,100.0,10,"{'max_depth': 100, 'n_estimators': 10}",0.9375,0.8125,0.8125,0.9375,0.933333,0.886667,0.060576,2
4,0.029636,0.003866,0.005866,0.000437,10.0,10,"{'max_depth': 10, 'n_estimators': 10}",0.875,0.875,0.875,0.875,0.933333,0.886667,0.023333,2
8,0.027573,0.004715,0.008021,0.002833,50.0,10,"{'max_depth': 50, 'n_estimators': 10}",0.875,0.8125,0.875,1.0,0.866667,0.885833,0.061667,4
11,0.24989,0.011335,0.018764,0.00367,50.0,100,"{'max_depth': 50, 'n_estimators': 100}",0.875,0.875,0.8125,1.0,0.866667,0.885833,0.061667,4
