In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor 



In [2]:
# Load the customer-behaviour dataset from CSV into a pandas DataFrame and
# display it (pandas truncates the rendering of long frames).
df = pd.read_csv(filepath_or_buffer="customer_behavior_prediction_dataframe.csv")
df

Unnamed: 0,review_length,number_of_keywords,word_diversity,word_complexity,has_image,has_emoji,timing,count_helpful,creativity
0,279,145,60.161175,7.7,0,0,633,3,72.495699
1,85,37,62.242682,7.5,1,0,666,0,66.882998
2,101,46,46.116319,11.4,0,0,652,2,70.648072
3,52,25,84.124444,11.3,0,0,672,0,52.908236
4,66,34,66.000000,5.4,0,0,659,0,68.367608
...,...,...,...,...,...,...,...,...,...
983,15,14,63.000000,10.9,0,0,1314,2,
984,20,10,112.000000,0.8,0,1,1361,8,58.364846
985,27,13,40.824000,-0.3,0,1,1268,1,72.521003
986,11,7,33.880000,1.2,0,0,1489,7,


In [3]:
# Report, per column, whether it contains any missing value.
# Only the target variable ("creativity") has NaNs. They arise because the
# divergent thinking score is calculated from the review's keyword list: when
# a review has only one keyword, the calculation gives an empty result.
df.isnull().any()

review_length         False
number_of_keywords    False
word_diversity        False
word_complexity       False
has_image             False
has_emoji             False
timing                False
count_helpful         False
creativity             True
dtype: bool

In [6]:
# Count the rows whose creativity score is missing, print the count, then show
# it as a fraction of all rows. The missing rows make up about a quarter of the
# data, so more data is needed.
creativity_is_missing = df["creativity"].isna()
nan_count = creativity_is_missing.sum()
print(nan_count)
nan_count / len(df)

252


0.2550607287449393

In [None]:
# Smallest observed value of the creativity measure (the divergent thinking
# score); the original note records it as 36.18. NaNs are skipped, which is the
# pandas default for min().
min_creativity = df["creativity"].min()
min_creativity

In [None]:
# Standard deviation of the observed creativity scores. NaNs are skipped,
# which is the pandas default for std().
sd_creativity = df["creativity"].std()
sd_creativity

In [None]:
# Fillin the NA with the value even less than the 36.18, this would also sort of serve as a hyperparameter
df["creativity"] = df["creativity"].fillna(min_creativity-sd_creativity)
df

In [None]:
# Repeat the per-column missing-value check; once the creativity column has
# been filled, every column is expected to report False.
df.isnull().any()

In [None]:
# Histogram of review lengths using bins of width 50 covering 0 to 500.
# np.arange excludes its stop value, so np.arange(0, 500, 50) ends at 450 and
# silently drops every review longer than 450. Stopping at 501 makes 500 the
# final bin edge, so the [450, 500] range is included.
review_length_bins = np.arange(0, 501, 50)

fig, ax = plt.subplots()
ax.hist(df["review_length"], bins=review_length_bins)
ax.set(title='The length of the review Histogram', xlabel="review length", ylabel='Frequency')
plt.show()

In [None]:
# The last column of df is the target; every column before it is a feature.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

# First hold out 20% of the rows as the test set, then take 25% of the
# remaining development rows as the validation set. Overall this gives a
# 60% / 20% / 20% train / validation / test split. Both splits are seeded.
x_dev, x_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
x_train, x_val, y_train, y_val = train_test_split(
    x_dev, y_dev, test_size=0.25, random_state=0
)

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so `lr` is the fitted model; the bare `lr` displays it.
lr = LinearRegression().fit(x_train, y_train)
lr

In [None]:
# Predict the validation and test targets with the fitted linear model.
y_val_pred, y_test_pred = lr.predict(x_val), lr.predict(x_test)

# Score both splits with R^2 and mean squared error. The variable names and
# printed labels say "accuracy", but these are regression metrics.
val_accuracy_r2, test_accuracy_r2 = (
    r2_score(y_val, y_val_pred),
    r2_score(y_test, y_test_pred),
)
val_accuracy_mse, test_accuracy_mse = (
    mean_squared_error(y_val, y_val_pred),
    mean_squared_error(y_test, y_test_pred),
)

# Print each metric for the validation and test sets.
for metric_label, metric_value in (
    ('r2 Validation accuracy:', val_accuracy_r2),
    ('r2 Test accuracy:', test_accuracy_r2),
    ('mse Validation accuracy:', val_accuracy_mse),
    ('mse Test accuracy:', test_accuracy_mse),
):
    print(metric_label, metric_value)

In [None]:
# Plot the relationship between each feature and the label (creativity),
# one scatter panel per feature on a 4x2 grid.
# The panels are described as data instead of eight copy-pasted stanzas, so
# adding or changing a panel is a one-line edit.
# Each entry is (column in df, panel title, x-axis label).
scatter_panels = [
    ("review_length", "Length of the Review VS. Creativity", "Review Length"),
    ("number_of_keywords", "No. keywords VS. Creativity", "No. keywords"),
    ("word_diversity", "Word diversity VS. Creativity", "Word diversity"),
    ("word_complexity", "Word complexity VS. Creativity", "Word complexity"),
    ("has_image", "Has image VS. Creativity", "Has image"),
    ("has_emoji", "Has emoji VS. Creativity", "Has emoji"),
    ("timing", "Timing VS. Creativity", "timing"),
    ("count_helpful", "No. helpful VS. Creativity", "No. helpful"),
]

fig, axs = plt.subplots(4, 2, figsize=(20, 20))

# axs.flat walks the grid row by row ([0,0], [0,1], [1,0], ...), which is the
# same panel order the list above is written in.
for ax, (column, title, xlabel) in zip(axs.flat, scatter_panels):
    ax.scatter(df[column], y)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Creativity")

plt.show()

In [None]:
# Try a decision tree regressor on the same train / validation / test splits.
# random_state is pinned so the fitted tree, and therefore the scores below,
# are reproducible from run to run: scikit-learn permutes the candidate
# features at every split, so an unseeded tree can differ between runs. The
# seed matches the random_state=0 used for the train_test_split calls.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(x_train, y_train)

# Predict the validation and test targets with the fitted tree.
y_val_pred2 = regressor.predict(x_val)
y_test_pred2 = regressor.predict(x_test)

# Score both splits with R^2 and mean squared error. The variable names and
# printed labels say "accuracy", but these are regression metrics.
val_accuracy_r2_tree = r2_score(y_val, y_val_pred2)
test_accuracy_r2_tree = r2_score(y_test, y_test_pred2)

val_accuracy_mse_tree = mean_squared_error(y_val, y_val_pred2)
test_accuracy_mse_tree = mean_squared_error(y_test, y_test_pred2)

# Print each metric for the validation and test sets.
print('r2 Validation accuracy:', val_accuracy_r2_tree)
print('r2 Test accuracy:', test_accuracy_r2_tree)
print('mse Validation accuracy:', val_accuracy_mse_tree)
print('mse Test accuracy:', test_accuracy_mse_tree)