In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import linear_model

### Data Cleaning

In [2]:
# data cleaning

# read data
d = pd.read_csv('df_eng.csv', index_col = 0)
product = list(d['phone_url'].str.split('/'))


# parse brand and model from phont_url and add back to the dataframe
product2 = []
for i in product:
    product2.append(i[2])
    
product2_split = []
for p in product2:
    product2_split.append(p.split('-'))

brands = []
for p in product2_split:
    brands.append(p[0])
    
d['brand'] = brands

models = []
for p in product2_split:
    models.append(p[1])
    
d['models'] = models


# delete unused column
del d['lang']
del d['country']
del d['domain']
del d['score_max']
del d['author']
del d['product']


# drop columns with nan score
d.dropna(subset=['score'], inplace = True)
d.dropna(subset = ['extract'], inplace = True)

# lowercase extract text
d['extract'] = d.apply(lambda row: str(row['extract']).lower(),axis=1)

In [3]:
# d.to_csv('data_cleaned.csv')
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy


### Tokenization and Lemmatization

In [4]:
# tokenization and lemmatization function
def token_lemma(df,col):
    # tokenize by spacy
    nlp = spacy.load('en')
    df['token'] = df[col].apply(lambda x: nlp(x))
    # lemmatize by spacy
    df['lemmatized'] = df['token'].apply(lambda x: " ".join([token.lemma_ if token.lemma_ != "-PRON-" else token.text for token in x]))
    return df

In [5]:
d['extract_str'] = d.apply(lambda row: str(row['extract']),axis=1)

In [6]:
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy,as a diehard samsung fan who has had every sam...
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy,love the phone. the phone is sleek and smooth ...
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy,adequate feel. nice heft. processor's still sl...
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy,never disappointed. one of the reasons i've be...
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy,i've now found that i'm in a group of people t...


In [7]:
# lemmatization
d_lemma = token_lemma(d, 'extract_str')

In [8]:
d_lemma.head(10)

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str,token,lemmatized
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy,as a diehard samsung fan who has had every sam...,"(as, a, diehard, samsung, fan, who, has, had, ...",as a diehard samsung fan who have have every s...
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy,love the phone. the phone is sleek and smooth ...,"(love, the, phone, ., the, phone, is, sleek, a...",love the phone . the phone be sleek and smooth...
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy,adequate feel. nice heft. processor's still sl...,"(adequate, feel, ., nice, heft, ., processor, ...",adequate feel . nice heft . processor 's still...
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy,never disappointed. one of the reasons i've be...,"(never, disappointed, ., one, of, the, reasons...",never disappoint . one of the reason i have be...
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy,i've now found that i'm in a group of people t...,"(i, 've, now, found, that, i, 'm, in, a, group...",i have now find that i be in a group of people...
5,/cellphones/samsung-galaxy-s8/,5/10/2017,Verizon Wireless,10.0,i am the type of person who never would comple...,samsung,galaxy,i am the type of person who never would comple...,"(i, am, the, type, of, person, who, never, wou...",i be the type of person who never would comple...
6,/cellphones/samsung-galaxy-s8/,5/10/2017,Verizon Wireless,4.0,the way this samsung s8 phone operates is more...,samsung,galaxy,the way this samsung s8 phone operates is more...,"(the, way, this, samsung, s8, phone, operates,...",the way this samsung s8 phone operate be more ...
7,/cellphones/samsung-galaxy-s8/,5/10/2017,Verizon Wireless,6.0,i bought this phone very excited to use it. i ...,samsung,galaxy,i bought this phone very excited to use it. i ...,"(i, bought, this, phone, very, excited, to, us...",i buy this phone very excited to use it . i ag...
8,/cellphones/samsung-galaxy-s8/,5/10/2017,Verizon Wireless,6.0,it is an extremely advanced and truly a smart ...,samsung,galaxy,it is an extremely advanced and truly a smart ...,"(it, is, an, extremely, advanced, and, truly, ...",it be an extremely advanced and truly a smart ...
9,/cellphones/samsung-galaxy-s8/,5/10/2017,Verizon Wireless,8.0,"great phone with a phenomenal camera, not all ...",samsung,galaxy,"great phone with a phenomenal camera, not all ...","(great, phone, with, a, phenomenal, camera, ,,...","great phone with a phenomenal camera , not all..."


### Vectorization

In [9]:
# vectorization
stopwords2 = stopwords.words() + ['phone', 'phones', 'cell', 'mobile']

vectorizer = TfidfVectorizer(ngram_range = (2, 3),
                             token_pattern = r'\b[a-zA-Z]{3,}\b', # detect text with three or more alphanumeric words
                             max_df = 0.4,
                             min_df = 2,
                             stop_words = stopwords2,
                             max_features = 200 # keep top 200 features
                            )

In [10]:
d_lemma2 = d_lemma.reset_index(drop = True)

In [11]:
# vectorization
corpus = list(d_lemma2["lemmatized"].values)
X = vectorizer.fit_transform(corpus)
features = X.toarray()
d_formodel=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
d_formodel['TARGET'] = d_lemma2['score']  # adding target to the data frame
d_formodel.index = d_lemma2['extract'] # adding review to the data frame for future review

  'stop_words.' % sorted(inconsistent))


### Split Train and Test Sets

In [12]:
# feature selection and modeling - LASSO

# split train, test sets
train_df, test_df = train_test_split(d_formodel)

X_train = train_df.loc[:, ~train_df.columns.isin(['TARGET'])] # remove target
X_test = test_df.loc[:, ~test_df.columns.isin(['TARGET'])]

y_train = train_df['TARGET'] # get target
y_test = test_df['TARGET']

### Baseline (mean)

In [13]:
# baseline (mean)
np.mean(d['score'])

7.682604457456658

In [14]:
# baseline train RMSE
baseline_rmse_train = pd.DataFrame(train_df['TARGET'])
baseline_rmse_train['predict'] = np.mean(d['score'])
baseline_rmse_train['rmse'] = (baseline_rmse_train['TARGET'] - baseline_rmse_train['predict']) ** 2
(sum(baseline_rmse_train['rmse'])/ len(baseline_rmse_train)) ** 1/2   #RMSE

4.162747676810994

In [15]:
# train set R2
round(r2_score(y_true = baseline_rmse_train['TARGET'], y_pred = baseline_rmse_train['predict']), 4)

-0.0

In [16]:
# baseline test RMSE
baseline_rmse_test = pd.DataFrame(test_df['TARGET'])
baseline_rmse_test['predict'] = np.mean(d['score'])
baseline_rmse_test['rmse'] = (baseline_rmse_test['TARGET'] - baseline_rmse_test['predict']) ** 2
(sum(baseline_rmse_test['rmse'])/ len(baseline_rmse_test)) ** 1/2   #RMSE

4.174465773844156

In [17]:
# train set R2
round(r2_score(y_true = baseline_rmse_test['TARGET'], y_pred = baseline_rmse_test['predict']), 4)

-0.0

### Linear Regression

In [18]:
# Linear regression 
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
coef_lr = pd.Series(lr.coef_, index = X_train.columns)

In [19]:
# predict for train set
y_pred = lr.predict(X_train)

linear_rmse_train = pd.DataFrame(train_df['TARGET'])
linear_rmse_train['predict'] = y_pred
linear_rmse_train['rmse'] = (linear_rmse_train['TARGET'] - linear_rmse_train['predict']) ** 2
(sum(linear_rmse_train['rmse'])/ len(linear_rmse_train)) ** 1/2   #RMSE

3.569599218690756

In [20]:
# train set R2
r2_score(y_true = linear_rmse_train['TARGET'], y_pred = linear_rmse_train['predict'])

0.142489589451418

In [21]:
# predict for test set
y_pred = lr.predict(X_test)

linear_rmse_test = pd.DataFrame(test_df['TARGET'])
linear_rmse_test['predict'] = y_pred
linear_rmse_test['rmse'] = (linear_rmse_test['TARGET'] - linear_rmse_test['predict']) ** 2
(sum(linear_rmse_test['rmse'])/ len(linear_rmse_test)) ** 1/2   #RMSE

3.5835735973119425

In [22]:
# test set R2
r2_score(y_true = linear_rmse_test['TARGET'], y_pred = linear_rmse_test['predict'])

0.14154863167969656

### LASSO Regression with alpha = 0.005

In [23]:
# LASSO model with alpha = 0.005
# fit model to train
reg = Lasso(alpha = 0.005) 
reg.fit(X_train, y_train)
coef_lasso = pd.Series(reg.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " +  str(sum(coef_lasso == 0)) + " variables")

Lasso picked 35 variables and eliminated the other 165 variables


In [24]:
# predict for train set
y_pred = reg.predict(X_train)

lasso_rmse_train = pd.DataFrame(train_df['TARGET'])
lasso_rmse_train['predict'] = y_pred
lasso_rmse_train['rmse'] = (lasso_rmse_train['TARGET'] - lasso_rmse_train['predict']) ** 2
(sum(lasso_rmse_train['rmse'])/ len(lasso_rmse_train)) ** 1/2   #RMSE

3.838993267543084

In [25]:
# train set R2
r2_score(y_true = lasso_rmse_train['TARGET'], y_pred = lasso_rmse_train['predict'])

0.07777414458753695

In [26]:
# predict for test set
y_pred = reg.predict(X_test)

lasso_rmse_test = pd.DataFrame(test_df['TARGET'])
lasso_rmse_test['predict'] = y_pred
lasso_rmse_test['rmse'] = (lasso_rmse_test['TARGET'] - lasso_rmse_test['predict']) ** 2
(sum(lasso_rmse_test['rmse'])/ len(lasso_rmse_test)) ** 1/2   #RMSE

3.848641332587018

In [27]:
# test set R2
r2_score(y_true = lasso_rmse_test['TARGET'], y_pred = lasso_rmse_test['predict'])

0.07805118873151573

### LASSO Regression Cross Validation Model

In [28]:
# LASSO CV model
# fit model to train
reg_cv = LassoCV()
reg_cv.fit(X_train, y_train)
coef_regcv = pd.Series(reg_cv.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_regcv != 0)) + " variables and eliminated the other " +  str(sum(coef_regcv == 0)) + " variables")



Lasso picked 199 variables and eliminated the other 1 variables


In [29]:
# predict for train set
y_pred = reg_cv.predict(X_train)

lassocv_rmse_train = pd.DataFrame(train_df['TARGET'])
lassocv_rmse_train['predict'] = y_pred
lassocv_rmse_train['rmse'] = (lassocv_rmse_train['TARGET'] - lassocv_rmse_train['predict']) ** 2
(sum(lassocv_rmse_train['rmse'])/ len(lassocv_rmse_train)) ** 1/2   # RMSE

3.5696333798957722

In [30]:
# train set R2
r2_score(y_true = lassocv_rmse_train['TARGET'], y_pred = lassocv_rmse_train['predict'])

0.14248138304452773

In [31]:
# predict for test set
y_pred = reg_cv.predict(X_test)

lassocv_rmse_test = pd.DataFrame(test_df['TARGET'])
lassocv_rmse_test['predict'] = y_pred
lassocv_rmse_test['rmse'] = (lassocv_rmse_test['TARGET'] - lassocv_rmse_test['predict']) ** 2
(sum(lassocv_rmse_test['rmse'])/ len(lassocv_rmse_test)) ** 1/2   # RMSE

3.5836121207615004

In [32]:
# train set R2
r2_score(y_true = lassocv_rmse_test['TARGET'], y_pred = lassocv_rmse_test['predict'])

0.14153940331862858

### Coefficients of Best Model (LASSO CV)

In [44]:
coef_lr_desc = coef_lr.sort_values(ascending = False)
coef_lr_desc[:20]

excellent product    2.452621
good ever            2.322421
highly recommend     2.229287
love new             2.177848
great product        2.101927
absolutely love      2.079232
super fast           2.059029
love love            2.032607
everything need      2.003075
work perfectly       1.963291
good iphone          1.957857
good smartphone      1.929496
great love           1.905807
great price          1.830238
happy purchase       1.809032
great buy            1.799091
love everything      1.791737
easy use             1.779209
love samsung         1.747154
nice product         1.734897
dtype: float64

In [45]:
coef_lr_aesc = coef_lr.sort_values(ascending = True)
coef_lr_aesc[:20]

bad ever          -5.547389
waste money       -4.918649
stop work         -4.070216
send back         -3.849688
work properly     -3.470201
buy product       -2.738116
hold charge       -2.624118
battery drain     -2.263830
every time        -2.211277
drop call         -2.186340
two month         -2.013661
one month         -1.833336
heating problem   -1.769162
month use         -1.704547
could get         -1.514371
buy new           -1.499779
one day           -1.346494
make call         -1.315271
text message      -1.309045
sim card          -1.205412
dtype: float64

In [34]:
# most negative mentioned
coef_regcv_asc = coef_regcv.sort_values()
coef_regcv_asc[:20]

bad ever          -5.540219
waste money       -4.914819
stop work         -4.069542
send back         -3.844052
work properly     -3.465388
buy product       -2.724912
hold charge       -2.612965
battery drain     -2.255692
every time        -2.198401
drop call         -2.175698
two month         -1.999704
one month         -1.818212
heating problem   -1.762694
month use         -1.693828
could get         -1.501411
buy new           -1.489918
one day           -1.332836
make call         -1.310990
text message      -1.300037
sim card          -1.204872
dtype: float64

In [37]:
# most positive mentioned
coef_regcv_desc = coef_regcv.sort_values(ascending = False)
coef_regcv_desc[:20]

excellent product    2.443020
good ever            2.319211
highly recommend     2.221265
love new             2.166652
great product        2.092117
absolutely love      2.069935
super fast           2.044935
love love            2.014252
everything need      1.992939
work perfectly       1.955589
good iphone          1.941756
good smartphone      1.918018
great love           1.892222
great price          1.825087
happy purchase       1.792988
great buy            1.780546
easy use             1.777148
love everything      1.776799
love samsung         1.729998
nice product         1.719287
dtype: float64

### Test Real Reviews

In [40]:
# test reviews data processing
test = pd.read_excel('test_review.xlsx', sheet_name = 'Sheet1')
test['review'] = test.apply(lambda row: str(row['review']).lower(),axis=1)
test['review_str'] = test.apply(lambda row: str(row['review']),axis=1)

test_lemma = token_lemma(test, 'review_str')

stopwords2 = stopwords.words() + ['phone', 'phones', 'cell', 'mobile']

vectorizer = TfidfVectorizer(ngram_range = (2, 3),
                             token_pattern = r'\b[a-zA-Z]{3,}\b', # detect text with three or more alphanumeric words
                             stop_words = stopwords2,
                             max_features = 200 # keep top 200 features
                            )

test_lemma2 = test_lemma.reset_index(drop = True)

# vectorization
corpus = list(test_lemma2["lemmatized"].values)
X = vectorizer.fit_transform(corpus)
features = X.toarray()
test_formodel=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
test_formodel['TARGET'] = test_lemma2['rating']  # adding target to the data frame
test_formodel.index = test_lemma2['review'] # adding review to the data frame for future review

testset = test_formodel.loc[:, ~test_formodel.columns.isin(['TARGET'])]

  'stop_words.' % sorted(inconsistent))


In [43]:
test_pred = reg_cv.predict(testset)
test_pred

array([ 9.69569407,  9.06505434,  8.26465512, 10.96202519,  8.63618571,
        9.08661408,  8.32710799,  9.06698603,  6.91532315,  7.84028028,
        6.68897733,  8.69979071,  9.29402914])

In [36]:
# predict select reivews scores
review1 = pd.DataFrame(testset.iloc[4,]).T
review2 = pd.DataFrame(testset.iloc[10,]).T
testset = review1.append(review2)
test_pred = reg_cv.predict(testset)
test_pred

array([8.63618571, 6.68897733])