In [31]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/Amazon_Unlocked_Mobile.csv")
df.head()

# Set the maximum number of rows to display (adjust as needed)
max_rows_to_display = 5  # Change this to the desired number of rows

# Pretty print the DataFrame with a maximum number of rows
print(df.head(max_rows_to_display).to_string(index=False))

                                                                                             Product Name Brand Name  Price  Rating                                                                                                                                                                                                                                                                                                                                                                                Reviews  Review Votes
"CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D700*FRONT CAMERA*ANDROID*SLIDER*QWERTY KEYBOARD*TOUCH SCREEN    Samsung 199.99       5 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & 

In [32]:
df.dropna(inplace=True)
df[df['Rating'] != 3]
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0,0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0,0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0,0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0,1
9,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,3,It's battery life is great. It's very responsi...,0.0,0


In [33]:
round(df['Positively Rated'].mean(),2)

0.69

In [34]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state= 0)

In [35]:
print("X_train first entry: \n\n", X_train[0])
print("\n\nX_train shape: ", X_train.shape)

X_train first entry: 

 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


X_train shape:  (250751,)


In [36]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer 

vect = CountVectorizer().fit(X_train) 

In [37]:
vect.get_feature_names_out()[::3000]

array(['00', '858', 'approval', 'booth', 'cmon', 'dealsthanks', 'eclair',
       'ff', 'gsmpros', 'insertion', 'linkhttps', 'movment', 'outmatches',
       'preserving', 'reinstall', 'separating', 'stilllllll', 'todo',
       'veryclear'], dtype=object)

In [38]:
len(vect.get_feature_names_out())

56948

In [39]:
# Transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

In [40]:
X_train_vectorized

<250751x56948 sparse matrix of type '<class 'numpy.int64'>'
	with 6848862 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.linear_model import LogisticRegression 

model = LogisticRegression() 
model.fit(X_train_vectorized, y_train) 

In [42]:
from sklearn.metrics import roc_auc_score 

"""
The roc_auc_score function computes the AUC-ROC score, which measures the model's ability to distinguish 
between positive and negative classes. It returns a value between 0 and 1, where a higher value indicates 
better model performance
"""

predictions = model.predict(vect.transform(X_test))

print("AUC: ", round(roc_auc_score(y_test, predictions),2)) 

AUC:  0.89


In [43]:
#Get the feature names as numpy array
feature_names = np.array(vect.get_feature_names_out())

#Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['worst' 'junk' 'disappointing' 'garbage' 'upset' 'false' 'dirty'
 'unusable' 'crashes' 'freezes']

Largest Coefs: 
['excelent' 'excelente' 'exelente' 'loving' 'perfecto' 'loves' 'excellent'
 'complaints' 'superb' 'worried']



In [44]:
#Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names_out())

18952

In [None]:
X_train_vectorized = vect.transform(X_train) 

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
print("AUC: ", round(roc_auc_score(y_test, predictions),2))

In [46]:
feature_names = np.array(vect.get_feature_names_out())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest Tfidf: 
['seizing' 'srgb' 'excites' '1b' '625nits' 'brawns' '700nits' '16nm'
 'liquidating' 'reading___']

Largest Tfidf: 
['unacceptable' 'hi' 'gud' 'gucci' 'love' 'loved' 'unusable' 'case'
 'seller' '5stars']



In [47]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest coef: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest coef: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest coef: 
['not' 'worst' 'disappointed' 'waste' 'poor' 'terrible' 'return' 'stopped'
 'slow' 'horrible']

Largest coef: 
['love' 'great' 'amazing' 'excellent' 'perfect' 'loves' 'awesome' 'best'
 'perfectly' 'easy']



In [48]:
# These reviews are treated the same by our current model

print(model.predict(vect.transform(['Not an issue, phone is working', 
                                     'an issue, phone is not working'])))

[0 0]


In [50]:
# n-grams
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names_out())

217388

In [None]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

In [54]:
feature_names = np.array(vect.get_feature_names_out())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:10]))
print('Largest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:-11:-1]))

Smallest Coef: 
['junk' 'no good' 'not happy' 'worst' 'not satisfied' 'not worth'
 'garbage' 'wouldn recommend' 'disappointing' 'unusable']

Largest Coef: 
['excelent' 'excelente' 'exelente' 'perfecto' 'no issues' 'excellent'
 'loving' 'perfect' 'awsome' 'exelent']



In [55]:
print(model.predict(vect.transform(['not an issue, phone is working',
                                   'an issue, phone is not working'])))

[0 0]
