In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import linear_model

### Data Cleaning

In [2]:
# data cleaning

# read data
d = pd.read_csv('df_eng.csv', index_col = 0)
product = list(d['phone_url'].str.split('/'))


# parse brand and model from phont_url and add back to the dataframe
product2 = []
for i in product:
    product2.append(i[2])
    
product2_split = []
for p in product2:
    product2_split.append(p.split('-'))

brands = []
for p in product2_split:
    brands.append(p[0])
    
d['brand'] = brands

models = []
for p in product2_split:
    models.append(p[1])
    
d['models'] = models


# delete unused column
del d['lang']
del d['country']
del d['domain']
del d['score_max']
del d['author']
del d['product']


# drop columns with nan score
d.dropna(subset=['score'], inplace = True)
d.dropna(subset = ['extract'], inplace = True)

# lowercase extract text
d['extract'] = d.apply(lambda row: str(row['extract']).lower(),axis=1)

In [3]:
# d.to_csv('data_cleaned.csv')
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy


In [4]:
# sample 100,000 reviews
random.seed(1)
d = d.sample(100000)

### Tokenization and Lemmatization

In [5]:
# tokenization and lemmatization function
def token_lemma(df,col):
    # tokenize by spacy
    nlp = spacy.load('en')
    df['token'] = df[col].apply(lambda x: nlp(x))
    # lemmatize by spacy
    df['lemmatized'] = df['token'].apply(lambda x: " ".join([token.lemma_ if token.lemma_ != "-PRON-" else token.text for token in x]))
    return df

In [6]:
d['extract_str'] = d.apply(lambda row: str(row['extract']),axis=1)

In [7]:
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str
648897,/cellphones/apple-iphone-5s/,1/3/2014,Amazon,2.0,nonsense wicked people,apple,iphone,nonsense wicked people
1474804,/cellphones/motorola-razr-v3/,6/23/2013,Amazon,2.0,this order was a huge mistake. would have veri...,motorola,razr,this order was a huge mistake. would have veri...
1080685,/cellphones/nokia-asha-300/,6/6/2012,Amazon,10.0,"it works well, is simple to use, especially if...",nokia,asha,"it works well, is simple to use, especially if..."
626988,/cellphones/lenovo-vibe-k5/,7/13/2016,Amazon,10.0,phone works with full satisfaction including o...,lenovo,vibe,phone works with full satisfaction including o...
13546,/cellphones/samsung-galaxy-s7-edge/,3/26/2016,Samsung,10.0,the galaxy s7 edge is by far the best phone i ...,samsung,galaxy,the galaxy s7 edge is by far the best phone i ...


In [8]:
# lemmatization
d_lemma = token_lemma(d, 'extract_str')

In [9]:
d_lemma.head(10)

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str,token,lemmatized
648897,/cellphones/apple-iphone-5s/,1/3/2014,Amazon,2.0,nonsense wicked people,apple,iphone,nonsense wicked people,"(nonsense, wicked, people)",nonsense wicked people
1474804,/cellphones/motorola-razr-v3/,6/23/2013,Amazon,2.0,this order was a huge mistake. would have veri...,motorola,razr,this order was a huge mistake. would have veri...,"(this, order, was, a, huge, mistake, ., would,...",this order be a huge mistake . would have veri...
1080685,/cellphones/nokia-asha-300/,6/6/2012,Amazon,10.0,"it works well, is simple to use, especially if...",nokia,asha,"it works well, is simple to use, especially if...","(it, works, well, ,, is, simple, to, use, ,, e...","it work well , be simple to use , especially i..."
626988,/cellphones/lenovo-vibe-k5/,7/13/2016,Amazon,10.0,phone works with full satisfaction including o...,lenovo,vibe,phone works with full satisfaction including o...,"(phone, works, with, full, satisfaction, inclu...",phone work with full satisfaction include otg ...
13546,/cellphones/samsung-galaxy-s7-edge/,3/26/2016,Samsung,10.0,the galaxy s7 edge is by far the best phone i ...,samsung,galaxy,the galaxy s7 edge is by far the best phone i ...,"(the, galaxy, s7, edge, is, by, far, the, best...",the galaxy s7 edge be by far the good phone i ...
288404,/cellphones/samsung-galaxy-note-4/,12/15/2014,Amazon,8.0,i would have given this 5 stars if it had arri...,samsung,galaxy,i would have given this 5 stars if it had arri...,"(i, would, have, given, this, 5, stars, if, it...",i would have give this 5 star if it have arriv...
1498840,/cellphones/nokia-3100/,10/23/2004,Phone Scoop,9.0,"i've had this phone for about three weeks now,...",nokia,3100,"i've had this phone for about three weeks now,...","(i, 've, had, this, phone, for, about, three, ...",i have have this phone for about three week no...
552011,/cellphones/lg-g2-594708/,12/4/2013,Amazon,8.0,recieved on a time line and very happy with th...,lg,g2,recieved on a time line and very happy with th...,"(recieved, on, a, time, line, and, very, happy...",reciev on a time line and very happy with the ...
1518083,/cellphones/audiovox-cdm-9155gpx-cdm9155sp/,6/23/2004,Phone Scoop,8.0,"i used this phone for the past two years, and ...",audiovox,cdm,"i used this phone for the past two years, and ...","(i, used, this, phone, for, the, past, two, ye...","i use this phone for the past two year , and m..."
387150,/cellphones/samsung-galaxy-j1/,10/22/2015,Amazon,10.0,this phone has been great. i've had it for mor...,samsung,galaxy,this phone has been great. i've had it for mor...,"(this, phone, has, been, great, ., i, 've, had...",this phone have be great . i have have it for ...


### Vectorization

In [10]:
# vectorization
stopwords2 = stopwords.words() + ['phone', 'phones', 'cell', 'mobile']

vectorizer = TfidfVectorizer(ngram_range = (2, 3),
                             token_pattern = r'\b[a-zA-Z]{3,}\b', # detect text with three or more alphanumeric words
                             max_df = 0.4,
                             min_df = 2,
                             stop_words = stopwords2,
                             max_features = 200 # keep top 200 features
                            )

In [11]:
d_lemma2 = d_lemma.reset_index(drop = True)

In [12]:
# vectorization
corpus = list(d_lemma2["lemmatized"].values)
X = vectorizer.fit_transform(corpus)
features = X.toarray()
d_formodel=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
d_formodel['TARGET'] = d_lemma2['score']  # adding target to the data frame
d_formodel.index = d_lemma2['extract'] # adding review to the data frame for future review

  'stop_words.' % sorted(inconsistent))


### Split Train and Test Sets

In [13]:
# feature selection and modeling - LASSO

# split train, test sets
train_df, test_df = train_test_split(d_formodel)

X_train = train_df.loc[:, ~train_df.columns.isin(['TARGET'])] # remove target
X_test = test_df.loc[:, ~test_df.columns.isin(['TARGET'])]

y_train = train_df['TARGET'] # get target
y_test = test_df['TARGET']

### Baseline (mean)

In [14]:
# baseline (mean)
np.mean(d['score'])

7.681789999999892

In [15]:
# baseline train RMSE
baseline_rmse_train = pd.DataFrame(train_df['TARGET'])
baseline_rmse_train['predict'] = np.mean(d['score'])
baseline_rmse_train['rmse'] = (baseline_rmse_train['TARGET'] - baseline_rmse_train['predict']) ** 2
(sum(baseline_rmse_train['rmse'])/ len(baseline_rmse_train)) ** 1/2   #RMSE

4.1855039474188525

In [16]:
# train set R2
round(r2_score(y_true = baseline_rmse_train['TARGET'], y_pred = baseline_rmse_train['predict']), 4)

-0.0

In [17]:
# baseline test RMSE
baseline_rmse_test = pd.DataFrame(test_df['TARGET'])
baseline_rmse_test['predict'] = np.mean(d['score'])
baseline_rmse_test['rmse'] = (baseline_rmse_test['TARGET'] - baseline_rmse_test['predict']) ** 2
(sum(baseline_rmse_test['rmse'])/ len(baseline_rmse_test)) ** 1/2   #RMSE

4.132450149530471

In [18]:
# train set R2
round(r2_score(y_true = baseline_rmse_test['TARGET'], y_pred = baseline_rmse_test['predict']), 4)

-0.0

### Linear Regression

In [41]:
# Linear regression 
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
coef_lr = pd.Series(lr.coef_, index = X_train.columns)

In [42]:
# predict for train set
y_pred = lr.predict(X_train)

linear_rmse_train = pd.DataFrame(train_df['TARGET'])
linear_rmse_train['predict'] = y_pred
linear_rmse_train['rmse'] = (linear_rmse_train['TARGET'] - linear_rmse_train['predict']) ** 2
(sum(linear_rmse_train['rmse'])/ len(linear_rmse_train)) ** 1/2   #RMSE

3.5751940749143327

In [228]:
# train set R2
r2_lr_train = r2_score(y_true = linear_rmse_train['TARGET'], y_pred = linear_rmse_train['predict'])
r2_lr_train

0.14581505803425288

In [237]:
# train set adj R2
adj_r2_lr_train = 1 - (1 - r2_lr_train) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_lr_train

0.14353111054306789

In [45]:
# predict for test set
y_pred = lr.predict(X_test)

linear_rmse_test = pd.DataFrame(test_df['TARGET'])
linear_rmse_test['predict'] = y_pred
linear_rmse_test['rmse'] = (linear_rmse_test['TARGET'] - linear_rmse_test['predict']) ** 2
(sum(linear_rmse_test['rmse'])/ len(linear_rmse_test)) ** 1/2   #RMSE

3.5518722078156406

In [231]:
# test set R2
r2_lr_test = r2_score(y_true = linear_rmse_test['TARGET'], y_pred = linear_rmse_test['predict'])
r2_lr_test

0.1404916106051738

In [238]:
# test set adj R2
adj_r2_lr_test = 1 - (1 - r2_lr_test) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_lr_test

0.13819342910703925

### LASSO Regression with alpha = 0.001

In [257]:
# LASSO model with alpha = 0.001
# fit model to train
reg = Lasso(alpha = 0.001) 
reg.fit(X_train, y_train)
coef_lasso = pd.Series(reg.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " +  str(sum(coef_lasso == 0)) + " variables")

Lasso picked 150 variables and eliminated the other 50 variables


In [251]:
# predict for train set
y_pred = reg.predict(X_train)

lasso_rmse_train = pd.DataFrame(train_df['TARGET'])
lasso_rmse_train['predict'] = y_pred
lasso_rmse_train['rmse'] = (lasso_rmse_train['TARGET'] - lasso_rmse_train['predict']) ** 2
(sum(lasso_rmse_train['rmse'])/ len(lasso_rmse_train)) ** 1/2   #RMSE

3.6099357773052203

In [252]:
# train set R2
r2_reg_train = r2_score(y_true = lasso_rmse_train['TARGET'], y_pred = lasso_rmse_train['predict'])
r2_reg_train

0.13751457464265082

In [253]:
# train set adj R2
adj_r2_reg_train = 1 - (1 - r2_reg_train) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_reg_train

0.1352084330488933

In [254]:
# predict for test set
y_pred = reg.predict(X_test)

lasso_rmse_test = pd.DataFrame(test_df['TARGET'])
lasso_rmse_test['predict'] = y_pred
lasso_rmse_test['rmse'] = (lasso_rmse_test['TARGET'] - lasso_rmse_test['predict']) ** 2
(sum(lasso_rmse_test['rmse'])/ len(lasso_rmse_test)) ** 1/2   #RMSE

3.5754581894215813

In [255]:
# test set R2
r2_reg_test = r2_score(y_true = lasso_rmse_test['TARGET'], y_pred = lasso_rmse_test['predict'])
r2_reg_test

0.13478409978381423

In [256]:
# test set adj R2
adj_r2_reg_test = 1 - (1 - r2_reg_test) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_reg_test

0.13247065735753538

### LASSO Regression Cross Validation Model

In [99]:
# LASSO CV model
# fit model to train
reg_cv = LassoCV(cv = 5)
reg_cv.fit(X_train, y_train)
coef_regcv = pd.Series(reg_cv.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_regcv != 0)) + " variables and eliminated the other " +  str(sum(coef_regcv == 0)) + " variables")

Lasso picked 198 variables and eliminated the other 2 variables


In [56]:
# predict for train set
y_pred = reg_cv.predict(X_train)

lassocv_rmse_train = pd.DataFrame(train_df['TARGET'])
lassocv_rmse_train['predict'] = y_pred
lassocv_rmse_train['rmse'] = (lassocv_rmse_train['TARGET'] - lassocv_rmse_train['predict']) ** 2
(sum(lassocv_rmse_train['rmse'])/ len(lassocv_rmse_train)) ** 1/2   # RMSE

3.5753190200605016

In [57]:
# train set R2
r2_regcv_train = r2_score(y_true = lassocv_rmse_train['TARGET'], y_pred = lassocv_rmse_train['predict'])
r2_regcv_train

0.1457852061545145

In [241]:
# train set adj R2
adj_r2_regcv_train = 1 - (1 - r2_regcv_train) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_regcv_train

0.14350117884440217

In [59]:
# predict for test set
y_pred = reg_cv.predict(X_test)

lassocv_rmse_test = pd.DataFrame(test_df['TARGET'])
lassocv_rmse_test['predict'] = y_pred
lassocv_rmse_test['rmse'] = (lassocv_rmse_test['TARGET'] - lassocv_rmse_test['predict']) ** 2
(sum(lassocv_rmse_test['rmse'])/ len(lassocv_rmse_test)) ** 1/2   # RMSE

3.5513581652835553

In [60]:
# test set R2
r2_regcv_test = r2_score(y_true = lassocv_rmse_test['TARGET'], y_pred = lassocv_rmse_test['predict'])
r2_regcv_test

0.14061600243096084

In [242]:
# test set adj R2
adj_r2_regcv_test = 1 - (1 - r2_regcv_test) * ((X_train.shape[0] - 1) / (X_train.shape[0] - X_train.shape[1] - 1))
adj_r2_regcv_test

0.1383181535357375

### Coefficients of Best Model (LASSO CV)

In [107]:
# most negative mentioned
coef_regcv_asc = coef_regcv.sort_values()
coef_regcv_asc[:50]

bad ever            -5.572641
waste money         -4.850061
stop work           -4.207888
work properly       -3.737942
send back           -3.647684
two month           -2.430124
battery drain       -2.374894
month use           -2.143992
get hot             -2.079389
drop call           -1.953762
could get           -1.852234
heating problem     -1.828147
one month           -1.780048
could use           -1.626493
one day             -1.524536
buy new             -1.428890
make call           -1.329083
text message        -1.140568
buy another         -1.131613
sim card            -1.075697
time get            -1.047834
back cover          -0.982143
look like           -0.879413
last month          -0.829493
two week            -0.815823
internal memory     -0.811672
first get           -0.716438
first time          -0.709179
month ago           -0.662787
give star           -0.648772
one year            -0.618881
touch screen        -0.609667
day use             -0.580879
get new   

In [108]:
# most positive mentioned
coef_regcv_desc = coef_regcv.sort_values(ascending = False)
coef_regcv_desc[:50]

excellent product    2.393180
good ever            2.314092
love new             2.247280
highly recommend     2.087740
absolutely love      2.071476
great product        2.050072
love love            2.044664
everything need      1.995682
super fast           1.993144
good smartphone      1.951325
good iphone          1.942708
work perfectly       1.939957
great love           1.909319
love everything      1.906491
easy set             1.905297
happy purchase       1.898938
great price          1.859615
great buy            1.850995
easy use             1.794416
great value          1.757412
work great           1.737793
love samsung         1.717580
galaxy edge          1.636879
large screen         1.623331
far good             1.619593
one good             1.583589
value money          1.578040
nice product         1.577375
light weight         1.567261
good android         1.561473
big screen           1.557646
really love          1.510675
great good           1.509624
great feat

### Test Real Reviews

In [219]:
# test reviews data processing
test = pd.read_excel('test_review.xlsx', sheet_name = 'Sheet1')
test['review'] = test.apply(lambda row: str(row['review']).lower(),axis=1)
test['review_str'] = test.apply(lambda row: str(row['review']),axis=1)

test_lemma = token_lemma(test, 'review_str')

stopwords2 = stopwords.words() + ['phone', 'phones', 'cell', 'mobile']

vectorizer = TfidfVectorizer(ngram_range = (2, 3),
                             token_pattern = r'\b[a-zA-Z]{3,}\b', # detect text with three or more alphanumeric words
                             stop_words = stopwords2,
                             max_features = 200 # keep top 200 features
                            )

test_lemma2 = test_lemma.reset_index(drop = True)

# vectorization
corpus = list(test_lemma2["lemmatized"].values)
X = vectorizer.fit_transform(corpus)
features = X.toarray()
test_formodel=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
test_formodel['TARGET'] = test_lemma2['rating']  # adding target to the data frame

testset = test_formodel.loc[:, ~test_formodel.columns.isin(['TARGET'])]

  'stop_words.' % sorted(inconsistent))


In [220]:
# predict select reivews scores
test_pred = reg_cv.predict(testset)
test_pred

array([ 7.96371968, 11.36874163,  8.36879719,  9.95759984,  9.36515668,
        8.44740099,  8.92921475,  7.13220545,  7.22802914,  7.71936947,
        2.59484013,  9.56424799,  9.13733315])

In [221]:
test['predict'] = test_pred
test[['review', 'rating', 'predict']]

Unnamed: 0,review,rating,predict
0,defective it doesn't take charge at all i was ...,2,7.96372
1,"hello, my name is victor and i am a student. i...",8,11.368742
2,phone is cheaply made and too big. last apple ...,2,8.368797
3,do not buy! i’ve been an active frequent user ...,2,9.9576
4,bought this iphone to upgrade from our iphone ...,8,9.365157
5,if you like iphones this is a could choice. i'...,10,8.447401
6,disappointed that i spent money for an expensi...,4,8.929215
7,the excellent experience of iphone brings me g...,10,7.132205
8,didn’t readily see you can only use a certain ...,6,7.228029
9,repackaged with sticky cellophane that left ad...,2,7.719369


We tried to use our model in the real world settings. Howevert, we found the model's accuracy is not good because of lack of data entry, which lead to poor n-grams / tokens. We think more data is needed to get accurate prediction for the model.