In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import spacy
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

### Data Cleaning

In [2]:
# data cleaning

# read data
d = pd.read_csv('df_eng.csv', index_col = 0)
product = list(d['phone_url'].str.split('/'))


# parse brand and model from phont_url and add back to the dataframe
product2 = []
for i in product:
    product2.append(i[2])
    
product2_split = []
for p in product2:
    product2_split.append(p.split('-'))

brands = []
for p in product2_split:
    brands.append(p[0])
    
d['brand'] = brands

models = []
for p in product2_split:
    models.append(p[1])
    
d['models'] = models


# delete unused column
del d['lang']
del d['country']
del d['domain']
del d['score_max']
del d['author']
del d['product']


# drop columns with nan score
d.dropna(subset=['score'], inplace = True)
d.dropna(subset = ['extract'], inplace = True)

# lowercase extract text
d['extract'] = d.apply(lambda row: str(row['extract']).lower(),axis=1)

In [3]:
# d.to_csv('data_cleaned.csv')
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy


In [4]:
# sample 100,000 data
random.seed(1)
d_sample = d.sample(100000)

In [5]:
# all data

### Tokenization and Lemmatization

In [6]:
# tokenization and lemmatization function
def token_lemma(df,col):
    # tokenize by spacy
    nlp = spacy.load('en')
    df['token'] = df[col].apply(lambda x: nlp(x))
    # lemmatize by spacy
    df['lemmatized'] = df['token'].apply(lambda x: " ".join([token.lemma_ for token in x]))
    return df

In [7]:
d['extract_str'] = d.apply(lambda row: str(row['extract']),axis=1)

In [8]:
d.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy,as a diehard samsung fan who has had every sam...
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy,love the phone. the phone is sleek and smooth ...
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy,adequate feel. nice heft. processor's still sl...
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy,never disappointed. one of the reasons i've be...
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy,i've now found that i'm in a group of people t...


In [10]:
# lemmatization
d_lemma = token_lemma(d, 'extract_str')

In [11]:
d_lemma.head()

Unnamed: 0,phone_url,date,source,score,extract,brand,models,extract_str,token,lemmatized
0,/cellphones/samsung-galaxy-s8/,5/2/2017,Verizon Wireless,10.0,as a diehard samsung fan who has had every sam...,samsung,galaxy,as a diehard samsung fan who has had every sam...,"(as, a, diehard, samsung, fan, who, has, had, ...",as a diehard samsung fan who have have every s...
1,/cellphones/samsung-galaxy-s8/,4/28/2017,Phone Arena,10.0,love the phone. the phone is sleek and smooth ...,samsung,galaxy,love the phone. the phone is sleek and smooth ...,"(love, the, phone, ., the, phone, is, sleek, a...",love the phone . the phone be sleek and smooth...
2,/cellphones/samsung-galaxy-s8/,5/4/2017,Amazon,6.0,adequate feel. nice heft. processor's still sl...,samsung,galaxy,adequate feel. nice heft. processor's still sl...,"(adequate, feel, ., nice, heft, ., processor, ...",adequate feel . nice heft . processor 's still...
3,/cellphones/samsung-galaxy-s8/,5/2/2017,Samsung,9.2,never disappointed. one of the reasons i've be...,samsung,galaxy,never disappointed. one of the reasons i've be...,"(never, disappointed, ., one, of, the, reasons...",never disappoint . one of the reason -PRON- ha...
4,/cellphones/samsung-galaxy-s8/,5/11/2017,Verizon Wireless,4.0,i've now found that i'm in a group of people t...,samsung,galaxy,i've now found that i'm in a group of people t...,"(i, 've, now, found, that, i, 'm, in, a, group...",-PRON- have now find that -PRON- be in a group...


### Vectorization

In [12]:
# vectorization
stopwords2 = stopwords.words() + ['phone', 'phones', 'cell', 'mobile'] # + brand ?

vectorizer = TfidfVectorizer(ngram_range = (2,4),
                             token_pattern = r'\b[a-zA-Z]{3,}\b', # detect text with three or more alphanumeric words
                             max_df = 0.4,
                             min_df = 2,
                             stop_words = stopwords2,
                             max_features = 200 # keep top 200 features
                            )

In [13]:
d_lemma2 = d_lemma.reset_index(drop = True)

In [14]:
# vectorization
corpus = list(d_lemma2["lemmatized"].values)
X = vectorizer.fit_transform(corpus)
features = X.toarray()
d_formodel=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
d_formodel['TARGET'] = d_lemma2['score']  # adding target to the data frame
d_formodel.index = d_lemma2['extract'] # adding review to the data frame for future review

  'stop_words.' % sorted(inconsistent))


### Split Train and Test Sets

In [15]:
# feature selection and modeling - LASSO

# split train, test sets
train_df, test_df = train_test_split(d_formodel)

X_train = train_df.loc[:, ~train_df.columns.isin(['TARGET'])] # remove target
X_test = test_df.loc[:, ~test_df.columns.isin(['TARGET'])]

y_train = train_df['TARGET'] # get target
y_test = test_df['TARGET']

### Baseline (mean)

In [16]:
# baseline (mean)
np.mean(d['score'])

7.669917999999874

In [17]:
# baseline train RMSE
baseline_rmse_train = pd.DataFrame(train_df['TARGET'])
baseline_rmse_train['predict'] = np.mean(d['score'])
baseline_rmse_train['rmse'] = (baseline_rmse_train['TARGET'] - baseline_rmse_train['predict']) ** 2
(sum(baseline_rmse_train['rmse'])/ len(baseline_rmse_train)) ** 1/2   #RMSE

4.158199320608871

In [18]:
# train set R2
round(r2_score(y_true = baseline_rmse_train['TARGET'], y_pred = baseline_rmse_train['predict']), 4)

-0.0

In [19]:
# baseline test RMSE
baseline_rmse_test = pd.DataFrame(test_df['TARGET'])
baseline_rmse_test['predict'] = np.mean(d['score'])
baseline_rmse_test['rmse'] = (baseline_rmse_test['TARGET'] - baseline_rmse_test['predict']) ** 2
(sum(baseline_rmse_test['rmse'])/ len(baseline_rmse_test)) ** 1/2   #RMSE

4.188432703802451

In [20]:
# train set R2
round(r2_score(y_true = baseline_rmse_test['TARGET'], y_pred = baseline_rmse_test['predict']), 4)

-0.0

### LASSO Regression with alpha = 0.005

In [21]:
# LASSO model with alpha = 0.005
# fit model to train
reg = Lasso(alpha = 0.005) 
reg.fit(X_train, y_train)
coef_lasso = pd.Series(reg.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " +  str(sum(coef_lasso == 0)) + " variables")

Lasso picked 53 variables and eliminated the other 147 variables


In [22]:
# predict for train set
y_pred = reg.predict(X_train)

lasso_rmse_train = pd.DataFrame(train_df['TARGET'])
lasso_rmse_train['predict'] = y_pred
lasso_rmse_train['rmse'] = (lasso_rmse_train['TARGET'] - lasso_rmse_train['predict']) ** 2
(sum(lasso_rmse_train['rmse'])/ len(lasso_rmse_train)) ** 1/2   #RMSE

3.734858131351764

In [24]:
# train set R2
r2_score(y_true = lasso_rmse_train['TARGET'], y_pred = lasso_rmse_train['predict'])

0.1017798073549726

In [25]:
# predict for test set
y_pred = reg.predict(X_test)

lasso_rmse_test = pd.DataFrame(test_df['TARGET'])
lasso_rmse_test['predict'] = y_pred
lasso_rmse_test['rmse'] = (lasso_rmse_test['TARGET'] - lasso_rmse_test['predict']) ** 2
(sum(lasso_rmse_test['rmse'])/ len(lasso_rmse_test)) ** 1/2   #RMSE

3.77359440716561

In [26]:
# test set R2
r2_score(y_true = lasso_rmse_test['TARGET'], y_pred = lasso_rmse_test['predict'])

0.09904352153064644

### LASSO Regression Cross Validation Model

In [27]:
# LASSO CV model
# fit model to train
reg_cv = LassoCV()
reg_cv.fit(X_train, y_train)
coef_regcv = pd.Series(reg_cv.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_regcv != 0)) + " variables and eliminated the other " +  str(sum(coef_regcv == 0)) + " variables")



Lasso picked 200 variables and eliminated the other 0 variables


In [28]:
# predict for train set
y_pred = reg_cv.predict(X_train)

lassocv_rmse_train = pd.DataFrame(train_df['TARGET'])
lassocv_rmse_train['predict'] = y_pred
lassocv_rmse_train['rmse'] = (lassocv_rmse_train['TARGET'] - lassocv_rmse_train['predict']) ** 2
(sum(lassocv_rmse_train['rmse'])/ len(lassocv_rmse_train)) ** 1/2   # RMSE

3.468533592005176

In [29]:
# train set R2
r2_score(y_true = lassocv_rmse_train['TARGET'], y_pred = lassocv_rmse_train['predict'])

0.16582991866226826

In [30]:
# predict for test set
y_pred = reg_cv.predict(X_test)

lassocv_rmse_test = pd.DataFrame(test_df['TARGET'])
lassocv_rmse_test['predict'] = y_pred
lassocv_rmse_test['rmse'] = (lassocv_rmse_test['TARGET'] - lassocv_rmse_test['predict']) ** 2
(sum(lassocv_rmse_test['rmse'])/ len(lassocv_rmse_test)) ** 1/2   # RMSE

3.502554629022282

In [31]:
# train set R2
r2_score(y_true = lassocv_rmse_test['TARGET'], y_pred = lassocv_rmse_test['predict'])

0.16375504526411255

### Coefficients of Best Model (LASSO CV)

In [32]:
# most positive mentioned
coef_regcv_desc = coef_regcv.sort_values(ascending = False)
coef_regcv_desc[:20]

love pron            2.571221
excellent product    2.517053
good ever            2.449651
work perfectly       2.338617
highly recommend     2.310389
pron amazing         2.246708
pron love            2.201105
great product        2.149057
pron awesome         2.127979
thank pron           2.050783
work great           2.023977
great price          1.999415
easy use             1.969306
one good             1.873813
everything pron      1.818230
absolutely love      1.774539
far good             1.765647
pron fast            1.761508
value money          1.655968
happy pron           1.648827
dtype: float64

In [33]:
# most negative mentioned
coef_regcv_asc = coef_regcv.sort_values()
coef_regcv_asc[:20]

stop work       -4.466626
return pron     -4.132220
pron money      -3.769776
work properly   -3.455021
send pron       -3.016243
turn pron       -2.867191
pron start      -2.331378
work pron       -1.902355
month pron      -1.899091
pron back       -1.853459
call pron       -1.783820
charge pron     -1.769439
pron keep       -1.731743
pron charge     -1.671865
pron even       -1.509526
tell pron       -1.354031
make call       -1.348034
sim card        -1.274817
time pron       -1.210502
back pron       -1.206784
dtype: float64