In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import operator

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [16]:
# Load both product-level tables in one step; each must expose a `fake` label column
# for the classification cells below (presumably image-similarity vs. text features,
# going by the file names).
df, df_text = (
    pd.read_csv('../data/product_level_data_with_img_feats.csv.gz'),
    pd.read_csv('../data/product_level_data_text_feats.csv.gz'),
)

# Class balance of the target in the main table.
print(df["fake"].value_counts())

fake
0    1927
1    1400
Name: count, dtype: int64


In [3]:
# Column-name groups; each list is passed as `features` to select predictors from `df`.
review_features = [
    'tfidf_review_body',
    'n_of_reviews',
    'avg_review_rating',
    'avg_days_between_reviews',
    'stdev_days_between_reviews',
    'max_days_between_reviews',
    'min_days_between_reviews',
    'share_helpful_reviews',
    'share_1star',
    'share_5star',
    'share_photo',
    'std_review_len',
]

network_features = [
    'pagerank',
    'w_degree',
    'clustering_coef',
    'eigenvector_cent',
]

# One row per similarity scope: overall, review-level, product-level.
image_sim_features = [
    'min_sim', 'max_sim', 'mean_sim', 'std_sim',
    'min_sim_review', 'max_sim_review', 'mean_sim_review', 'std_sim_review',
    'min_sim_product', 'max_sim_product', 'mean_sim_product', 'std_sim_product',
]

In [10]:
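# correlation matrix (disabled): uncomment the two lines below to export the pairwise
# correlations of all feature groups to CSV.
# NOTE: `path` is not defined in the cells shown here; define it before re-enabling.
# corr_table = df[review_features + network_features + image_sim_features].corr()
# corr_table.to_csv(path + 'Amazon Review Data/corr_table.csv')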

In [11]:
def model_building(X_train, y_train, X_test, y_test, model):
    """Fit ``model``, print held-out metrics, and return second-class scores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every reported metric.
        ``X_test`` must expose ``.shape``.
    model
        Estimator exposing ``fit``, ``predict``, ``predict_proba`` and
        ``classes_``. The confusion matrix is indexed as 2x2, so a binary
        problem is assumed.

    Returns
    -------
    Column 1 of ``model.predict_proba(X_test)``.

    Notes
    -----
    The columns printed as "TN" and "TP" are rates, not counts: the fraction
    of rows of the first class in ``model.classes_`` that were predicted as
    that class, and likewise for the second class. The F1 score uses
    ``average='weighted'``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

    # Score the test set once and reuse the result for both the AUC and the
    # return value (a second predict_proba pass is pure overhead).
    probs = model.predict_proba(X_test)[:, 1]

    auc = metrics.roc_auc_score(y_test, probs)
    accuracy = cm.trace() / X_test.shape[0]
    first_class_rate = cm[0, 0] / cm[0, :].sum()
    second_class_rate = cm[1, 1] / cm[1, :].sum()
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')

    print("AUC, Accuracy, TN, TP, F1 Score")
    print("{}, {}, {}, {}, {}".format(auc, accuracy, first_class_rate, second_class_rate, f1))

    return probs

In [12]:
def classification_results(df, features=None, stars=None):
    """Fit and report four classifiers for the binary ``fake`` label.

    The data is split 80/20 (``random_state=42``), standardised with a
    ``StandardScaler`` fitted on the training part only, and then handed to
    ``model_building`` for Logistic Regression, Random Forest, linear SVC and
    XGBoost in turn. Random-forest feature importances are printed in
    descending order (only the top 50 when there are more than 100 features).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``fake`` column. When ``features`` is None it must
        also contain ``product_ID``, which is dropped together with ``fake``.
    features : str or iterable of str, optional
        Column name(s) to use as predictors. When None, every column other
        than ``product_ID`` and ``fake`` is used.
    stars : optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    None
        All results are printed.
    """
    seed = 42

    # Identity check, not `== None`: comparing an array-like (pd.Index,
    # ndarray) with `==` is element-wise and its truth value is ambiguous.
    if features is None:
        X = df.drop(['product_ID', 'fake'], axis=1)
        features = list(X.columns)
    else:
        # Normalise to a list so a single name, a tuple or an Index all select
        # a 2-D frame and line up positionally with the importances below.
        features = [features] if isinstance(features, str) else list(features)
        X = df[features]
    y = df['fake']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)
    print(X_train.shape, X_test.shape)

    # Fit the scaler on the training split only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("="*10 + "Logistic Regression" + "="*10)
    model = LogisticRegression(max_iter=400)
    model_building(X_train, y_train, X_test, y_test, model)

    print("="*10 + "Random Forest" + "="*10)
    forest = RandomForestClassifier(random_state=seed,
                                    n_estimators=100,
                                    min_samples_leaf=3,
                                    min_samples_split=6,
                                    max_features='sqrt',
                                    max_depth=40,
                                    bootstrap=True,
                                    n_jobs=-1)
    model_building(X_train, y_train, X_test, y_test, forest)

    print("="*10 + "RF Feature Importance" + "="*10)
    feat_imp = dict(zip(features, forest.feature_importances_))
    ranked = sorted(feat_imp.items(), key=operator.itemgetter(1), reverse=True)
    # Keep the printout readable for very wide feature sets.
    print(ranked[:50] if len(features) > 100 else ranked)

    print("="*10 + "SVC Linear" + "="*10)
    # probability=True calibrates via an internal shuffled cross-validation;
    # seeding it makes the reported probabilities/AUC reproducible.
    model = SVC(kernel='linear', probability=True, random_state=seed)
    model_building(X_train, y_train, X_test, y_test, model)

    print("="*10 + "XGBoost" + "="*10)
    model = xgb.XGBClassifier()
    model_building(X_train, y_train, X_test, y_test, model)

In [14]:
# Benchmark all classifiers using only the `review_features` columns.
print("\n+++++++++++++++++ Review Features ++++++++++++++++")
classification_results(df, features=review_features)


+++++++++++++++++ Review Features ++++++++++++++++
(2661, 12) (666, 12)
AUC, Accuracy, TN, TP, F1 Score
0.8381394920868607, 0.7852852852852853, 0.8447368421052631, 0.7062937062937062, 0.7837976660156704
AUC, Accuracy, TN, TP, F1 Score
0.873997055576003, 0.8108108108108109, 0.8578947368421053, 0.7482517482517482, 0.8099259041135368
[('share_photo', 0.20188266872719235), ('max_days_between_reviews', 0.12459007267055033), ('n_of_reviews', 0.11969581754798014), ('share_5star', 0.09614349095760143), ('avg_days_between_reviews', 0.09051123243506502), ('stdev_days_between_reviews', 0.0793914781735086), ('tfidf_review_body', 0.07305978614888074), ('avg_review_rating', 0.05466433356550596), ('std_review_len', 0.05329773632676681), ('share_helpful_reviews', 0.05276771196540719), ('share_1star', 0.049646505698017394), ('min_days_between_reviews', 0.004349165783523933)]
AUC, Accuracy, TN, TP, F1 Score
0.8385811556864189, 0.7867867867867868, 0.8578947368421053, 0.6923076923076923, 0.78461458471384

In [15]:
# Benchmark all classifiers using only the `image_sim_features` columns.
print("\n+++++++++++++++++ Image Features ++++++++++++++++\n")
classification_results(df, features=image_sim_features)


+++++++++++++++++ Image Features ++++++++++++++++

(2661, 12) (666, 12)
AUC, Accuracy, TN, TP, F1 Score
0.6310314685314685, 0.6171171171171171, 0.8394736842105263, 0.32167832167832167, 0.5876303289185565
AUC, Accuracy, TN, TP, F1 Score
0.5922846889952154, 0.5990990990990991, 0.7921052631578948, 0.34265734265734266, 0.577051627055728
[('min_sim_product', 0.09995597904768756), ('max_sim_product', 0.09051231460695743), ('mean_sim', 0.08532591470373589), ('std_sim_product', 0.08529751456554026), ('mean_sim_review', 0.08518236020769004), ('mean_sim_product', 0.08464118823374056), ('max_sim', 0.08053983402315099), ('max_sim_review', 0.07872105669106193), ('std_sim_review', 0.07813345029630095), ('min_sim_review', 0.07803385638269882), ('min_sim', 0.0775420051778659), ('std_sim', 0.07611452606356969)]
AUC, Accuracy, TN, TP, F1 Score
0.6286115200588884, 0.5930930930930931, 0.9, 0.1853146853146853, 0.5294013999556516
AUC, Accuracy, TN, TP, F1 Score
0.5690973500184027, 0.5720720720720721, 0.728

In [16]:
# Benchmark all classifiers using only the `network_features` columns.
print("\n+++++++++++++++++ Network Features ++++++++++++++++\n")
classification_results(df, features=network_features)


+++++++++++++++++ Network Features ++++++++++++++++

(2661, 4) (666, 4)
AUC, Accuracy, TN, TP, F1 Score
0.8707443871917555, 0.8018018018018018, 0.8868421052631579, 0.6888111888111889, 0.798791120513186
AUC, Accuracy, TN, TP, F1 Score
0.889676113360324, 0.8213213213213213, 0.8394736842105263, 0.7972027972027972, 0.8214333867495662
[('clustering_coef', 0.39245346877008824), ('eigenvector_cent', 0.3113937594195792), ('w_degree', 0.15192030379870344), ('pagerank', 0.1442324680116291)]
AUC, Accuracy, TN, TP, F1 Score
0.8740338608759661, 0.7957957957957958, 0.9105263157894737, 0.6433566433566433, 0.7904053121444427
AUC, Accuracy, TN, TP, F1 Score
0.8833824070666176, 0.8108108108108109, 0.8368421052631579, 0.7762237762237763, 0.810727179728423


In [17]:
# Benchmark all classifiers on just two network columns
# (eigenvector centrality and clustering coefficient).
print("\n+++++++++++++++++ Top 2 Network Features ++++++++++++++++\n")
classification_results(
    df,
    features=['eigenvector_cent', 'clustering_coef'],
)


+++++++++++++++++ Top 2 Network Features ++++++++++++++++

(2661, 2) (666, 2)
AUC, Accuracy, TN, TP, F1 Score
0.8552033492822967, 0.7927927927927928, 0.8736842105263158, 0.6853146853146853, 0.7900083210298238
AUC, Accuracy, TN, TP, F1 Score
0.8785793154214208, 0.8123123123123123, 0.8315789473684211, 0.7867132867132867, 0.8124300280982839
[('clustering_coef', 0.5298974477100252), ('eigenvector_cent', 0.4701025522899747)]
AUC, Accuracy, TN, TP, F1 Score
0.8537035333087966, 0.7987987987987988, 0.8552631578947368, 0.7237762237762237, 0.7974725433015319
AUC, Accuracy, TN, TP, F1 Score
0.8760627530364373, 0.7987987987987988, 0.8131578947368421, 0.7797202797202797, 0.7991161741345159


In [18]:
# Benchmark all classifiers on the union of the three feature groups
# (review, then image-similarity, then network columns).
print("\n+++++++++++++++++ All Features ++++++++++++++++\n")
classification_results(
    df,
    features=[*review_features, *image_sim_features, *network_features],
)


+++++++++++++++++ All Features ++++++++++++++++

(2661, 28) (666, 28)
AUC, Accuracy, TN, TP, F1 Score
0.9211998527788002, 0.8573573573573574, 0.9, 0.8006993006993007, 0.8566464404686078
AUC, Accuracy, TN, TP, F1 Score
0.9323242546926758, 0.8603603603603603, 0.881578947368421, 0.8321678321678322, 0.8602667627191579
[('clustering_coef', 0.1879437408102332), ('eigenvector_cent', 0.16538684645682974), ('share_photo', 0.074828175274938), ('w_degree', 0.06317276263907272), ('n_of_reviews', 0.055488686779396584), ('max_days_between_reviews', 0.04880606551507896), ('pagerank', 0.04562423054213299), ('share_5star', 0.04113156224691351), ('tfidf_review_body', 0.02934999801330399), ('avg_days_between_reviews', 0.026644727708062695), ('stdev_days_between_reviews', 0.02447279088382289), ('avg_review_rating', 0.02354476788096144), ('std_review_len', 0.019727533795802096), ('share_1star', 0.019483917184519358), ('share_helpful_reviews', 0.01869076592971937), ('max_sim', 0.017983982948599812), ('min_

In [19]:
# Benchmark all classifiers on `df_text`; with no explicit feature list, every
# column except `product_ID` and `fake` is used as a predictor.
print("\n+++++++++++++++++ All Text ++++++++++++++++\n")
classification_results(df_text, features=None)


+++++++++++++++++ All Text ++++++++++++++++

(2661, 1000) (666, 1000)
AUC, Accuracy, TN, TP, F1 Score
0.7688627162311372, 0.7297297297297297, 0.7736842105263158, 0.6713286713286714, 0.7292205769599386
AUC, Accuracy, TN, TP, F1 Score
0.8565145380934854, 0.7702702702702703, 0.9289473684210526, 0.5594405594405595, 0.759466996031958
[('conveni', 0.01681284432258938), ('also', 0.01680816715803607), ('realli', 0.00917607628823779), ('qualiti good', 0.00879490249073717), ('realli good', 0.008786481897046845), ('satisfi', 0.008266776494747878), ('alway', 0.006266128192765259), ('great product', 0.006254260307400919), ('ship', 0.006049554270938087), ('describ', 0.005942054425909881), ('like much', 0.0055237877240866705), ('worri', 0.005492030033769896), ('howev', 0.005322275835306055), ('simpl', 0.005170545681156141), ('compani', 0.004887716481104342), ('work great', 0.004655225913381732), ('year', 0.004513179946075493), ('exactli', 0.004321163474307411), ('disappoint', 0.004277469573588835), 