In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

In [2]:
# Load the preprocessed, comma-separated dataset using pandas' Python parser engine.
df_a_raw = pd.read_csv(
    "preprocessed_data_with_cate.csv",
    sep=',',
    engine='python',
    encoding='unicode_escape',
)

### (LIWC) Selective dropping from dataset

In [4]:
# Prune LIWC indicators the author considers meaningless for this analysis.
# df_a is rebuilt from df_a_raw on every run; df_a_raw itself is never modified.
df_a = (
    df_a_raw
    # Punctuation counts
    .drop(columns=['AllPunc', 'SemiC', 'Comma', 'Colon', 'Period', 'Parenth', 'OtherP', 'Quote',
                   'Apostro', 'Exclam', 'Dash', 'QMark'])
    # Informal language
    .drop(columns=['informal', 'filler', 'netspeak', 'assent', 'nonflu', 'swear'])
    # Relativity - author's note: meaningless under a bag-of-words analysis (the likely case)
    .drop(columns=['motion', 'space', 'time', 'relativ'])
    # Time orientations - all of them
    .drop(columns=['focuspast', 'focuspresent', 'focusfuture'])
    # Biological - the four columns listed here
    .drop(columns=['body', 'ingest', 'sexual', 'bio'])
    # Perceptual indicators are deliberately kept for now (nothing dropped).
    # Cognitive processes - sub-categories only; the overall category is kept
    .drop(columns=['discrep', 'tentat', 'certain', 'cause', 'insight', 'differ'])
    # Social processes - only 'friend' goes; author's note: girl/boy indicators may be useful
    .drop(columns=['friend'])
    # Affective processes - author's note: sentiment analysis is done separately
    .drop(columns=['affect', 'anx', 'posemo', 'negemo'])
    # Other grammar
    # NOTE(review): the original note said comparisons might be interesting, yet 'compare'
    # is in this drop list - confirm intent.
    .drop(columns=['number', 'quant', 'interrog', 'compare', 'adj', 'verb'])
    # Linguistic dimensions (function words and pronouns)
    .drop(columns=['negate', 'conj', 'adverb', 'auxverb', 'prep', 'article', 'ipron', 'they',
                   'shehe', 'you', 'we', 'i', 'ppron', 'pronoun', 'fnctn'])
)

# All-predictor Feature Importance Analysis


In [11]:
# Model 1 input: start from df_a and discard the columns the author flagged as
# correlated/unimportant, followed by the LIWC summary variables.
df_liwc = (
    df_a
    .drop(columns=['User.Name', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
                   'Angry', 'Care',
                   'Video.Share.Status', 'Is.Video.Owner.', 'Total.Views.For.All.Crossposts',
                   'Sponsor.Name',
                   'Message_sentiment', 'Message_subjective',
                   '-', 'No', 'Yes', 'is_linked', 'image_text_ind', 'sponsor_cate_effect',
                   'page_cate_effect',
                   'page_desc_sentiment', 'page_desc_subjective'])
    # Summary variables
    .drop(columns=['WC', 'WPS', 'Dic', 'Sixltr', 'Clout', 'Authentic', 'Tone'])
)

In [12]:
# Page.Name is a grouping attribute, not a predictor; it is set aside here
# (author's note: it can be used in future iterations).
df_ready = df_liwc.drop(columns=['Page.Name'])

df_ready.columns

Index(['Likes.at.Posting', 'Followers.at.Posting', 'Type',
       'Total.Interactions', 'Post.Views', 'Total.Views', 'Sponsor.Category',
       'Analytic', 'anger', 'sad', 'social', 'family', 'female', 'male',
       'cogproc', 'percept', 'see', 'hear', 'feel', 'health', 'drives',
       'affiliation', 'achieve', 'power', 'reward', 'risk', 'work', 'leisure',
       'home', 'money', 'relig', 'death', 'PageSinceCreated',
       'PageCreatedYear', 'PageCreatedMonth', 'PageCreatedDay',
       'PageCreatedHour', 'PostCreatedYear', 'PostCreatedMonth',
       'PostCreatedday', 'PostCreatedHour', 'Message_len',
       'Message_sentiment_category', 'Message_subjective_category',
       'link_title_len', 'link_desp_len', 'page_desc_len',
       'page_desc_sentiment_category', 'page_desc_subjective_category', 'Link',
       'Live Video', 'Live Video Complete', 'Live Video Scheduled',
       'Native Video', 'Photo', 'Status', 'Video', 'YouTube'],
      dtype='object')

### Generate correlation matrix, profiling report and split train/test

In [13]:
# Disabled step, kept as a bare string literal so nothing in it executes (the cell only
# echoes the string). If enabled it would compute the Pearson correlation matrix of
# df_ready and write it to "first_pass_corrmat.csv".
'''df_corr = df_ready.corr(method='pearson')#.drop(['Page.Category', 'Type'],axis = 1)
df_corr.to_csv("first_pass_corrmat.csv")'''

'df_corr = df_ready.corr(method=\'pearson\')#.drop([\'Page.Category\', \'Type\'],axis = 1)\ndf_corr.to_csv("first_pass_corrmat.csv")'

In [14]:
# Dummify: one-hot encode the categorical predictors, dropping the first level of each.
categorical_cols = ['Type', 'Sponsor.Category', 'Message_sentiment_category',
                    'Message_subjective_category', 'page_desc_sentiment_category',
                    'page_desc_subjective_category']

# 'Page.Category' is not removed by any of the Model 1 drop lists, so it can still be in
# df_ready. Encode it whenever it is present (as the extrinsic-only model does) so that a
# raw categorical column is never handed to the regressor.
# NOTE(review): assumes Page.Category is non-numeric - confirm against the data.
if 'Page.Category' in df_ready.columns:
    categorical_cols.append('Page.Category')

X = pd.get_dummies(df_ready, columns=categorical_cols, drop_first=True)

# Split the regression target off; after this X holds predictors only.
y = X.pop('Total.Interactions')


In [15]:
# Disabled step, kept as a bare string literal so nothing in it executes (the cell only
# echoes the string). If enabled it would build a ProfileReport for df_ready and save it
# to "baseline.html".
'''profile = ProfileReport(df_ready, title="Pandas Profiling Report")
profile.to_file("baseline.html")'''


'profile = ProfileReport(df_ready, title="Pandas Profiling Report")\nprofile.to_file("baseline.html")'

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

### Define Models

In [12]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn import metrics
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def forest(X, labels, n_estimators=500, random_state=13, n_jobs=None):
    """Fit the full-size random-forest regressor and return the result of ``fit``.

    Args:
        X: Training predictors, forwarded unchanged to ``fit``.
        labels: Training target values, forwarded unchanged to ``fit``.
        n_estimators: Number of trees. Defaults to 500, the value previously hard-coded.
        random_state: Seed forwarded to the estimator. Defaults to 13, as before.
        n_jobs: Forwarded to the estimator's ``n_jobs`` argument. The default ``None``
            leaves the estimator's own default in effect, so existing two-argument
            calls behave as before.

    Returns:
        Whatever ``RandomForestRegressor.fit`` returns (the fitted estimator).
    """
    # Named `model` rather than `forest` so the local does not shadow this function.
    model = RandomForestRegressor(
        random_state=random_state,
        n_estimators=n_estimators,
        bootstrap=True,
        oob_score=True,
        max_features='sqrt',
        n_jobs=n_jobs,
    )
    return model.fit(X, labels)

def lightforest(X, labels):
    """Fit a smaller, depth-limited random-forest regressor and return the result of ``fit``.

    Uses 100 trees with ``max_depth=7``; the seed (13), bootstrap sampling, out-of-bag
    scoring and ``max_features='sqrt'`` match the settings written in ``forest``.

    Args:
        X: Training predictors, forwarded unchanged to ``fit``.
        labels: Training target values, forwarded unchanged to ``fit``.
    """
    hyperparams = dict(
        n_estimators=100,
        max_depth=7,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        random_state=13,
    )
    return RandomForestRegressor(**hyperparams).fit(X, labels)

# https://xgboost.readthedocs.io/en/stable/python/python_api.html
def XGB(X,labels):
    """Fit an XGBRegressor (600 estimators, seed 13) and return the result of ``fit``.

    Args:
        X: Training predictors, forwarded unchanged to ``fit``.
        labels: Training target values, forwarded unchanged to ``fit``.
    """
    return XGBRegressor(n_estimators=600, random_state=13).fit(X, labels)

# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor
def LGBM(X,labels):
    """Fit an LGBMRegressor (500 estimators, seed 13) and return the result of ``fit``.

    Args:
        X: Training predictors, forwarded unchanged to ``fit``.
        labels: Training target values, forwarded unchanged to ``fit``.
    """
    return LGBMRegressor(n_estimators=500, random_state=13).fit(X, labels)

# feature importance
#print(model.feature_importances_)

#print('\nAccuracy of random forest with 10-fold CV, 4 repeats: %.3f (STD: %.3f)' % (np.mean(scores_rf), np.std(scores_rf)))

  import pandas.util.testing as tm


## Model 1 - Both Extrinsic + Intrinsic Factors

In [18]:
import time

# Time the fit with a monotonic clock, so the reported interval cannot be distorted by
# system clock adjustments that happen while training runs.
start = time.perf_counter()

# Takes some time to run

rf_model = forest(x_train,y_train)
# Alternative models, left switched off:
#lrf_model = lightforest(x_train,y_train)
#xgb_model = XGB(x_train,y_train)
#lgbm_model = LGBM(x_train,y_train)

stop = time.perf_counter()
print(f"Training time: {stop - start} seconds")


Training time: 607.2746694087982 seconds


In [19]:
# Make predictions on the held-out split and print metrics
predictions = rf_model.predict(x_test)
# Alternative models, left switched off:
#predictions = lgbm_model.predict(x_test)
#predictions = lrf_model.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
# R^2 is taken from the same `predictions` as the two metrics above. This avoids a second
# prediction pass and keeps all three numbers about one model if `predictions` is switched.
print('R^2: ', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 462.5015146414815
Root Mean Squared Error: 8309.275602291853
R^2:  0.24583248396491653


In [20]:
# Pair every predictor column with the fitted forest's importance score, most important first.
coeff_list = (
    pd.DataFrame(
        list(zip(X.columns, rf_model.feature_importances_)),
        columns=['predictor', 'feature importance'],
    )
    .sort_values(by='feature importance', ascending=False)
)

# Persist the full ranking, then show the top ten.
coeff_list.to_csv("Baseline_FeatureImportance.csv", index=False)
coeff_list.head(10)

Unnamed: 0,predictor,feature importance
2,Post.Views,0.199624
3,Total.Views,0.121278
37,PostCreatedHour,0.036247
39,link_title_len,0.03218
1,Followers.at.Posting,0.031749
38,Message_len,0.030607
0,Likes.at.Posting,0.030362
7,social,0.029147
12,percept,0.026853
17,drives,0.025816


## Model 2 - Extrinsic Factors only

In [6]:
# Show the columns remaining in df_a (after the LIWC pruning), for reference when
# choosing what to drop for this model.
df_a.columns

Index(['Page.Name', 'User.Name', 'Page.Category', 'Likes.at.Posting',
       'Followers.at.Posting', 'Type', 'Total.Interactions', 'Likes',
       'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care',
       'Video.Share.Status', 'Is.Video.Owner.', 'Post.Views', 'Total.Views',
       'Total.Views.For.All.Crossposts', 'Sponsor.Name', 'Sponsor.Category',
       'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
       'anger', 'sad', 'social', 'family', 'female', 'male', 'cogproc',
       'percept', 'see', 'hear', 'feel', 'health', 'drives', 'affiliation',
       'achieve', 'power', 'reward', 'risk', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'PageSinceCreated', 'PageCreatedYear',
       'PageCreatedMonth', 'PageCreatedDay', 'PageCreatedHour',
       'PostCreatedYear', 'PostCreatedMonth', 'PostCreatedday',
       'PostCreatedHour', 'Message_sentiment', 'Message_subjective',
       'Message_len', 'Message_sentiment_category',
       'M

In [27]:
# Model 2 input: same pruning as the all-predictor model, plus 'Total.Views',
# followed by the LIWC summary variables.
df_liwc = (
    df_a
    .drop(columns=['User.Name', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
                   'Angry', 'Care',
                   'Video.Share.Status', 'Is.Video.Owner.', 'Total.Views.For.All.Crossposts',
                   'Sponsor.Name',
                   'Message_sentiment', 'Message_subjective', 'Total.Views',
                   '-', 'No', 'Yes', 'is_linked', 'image_text_ind', 'sponsor_cate_effect',
                   'page_cate_effect',
                   'page_desc_sentiment', 'page_desc_subjective'])
    # Summary variables
    .drop(columns=['WC', 'WPS', 'Dic', 'Sixltr', 'Clout', 'Authentic', 'Tone'])
)

In [28]:
# Set the page identifier aside; every remaining column is a predictor or the target.
df_ready = df_liwc.drop(columns=['Page.Name'])

df_ready.columns

Index(['Page.Category', 'Likes.at.Posting', 'Followers.at.Posting', 'Type',
       'Total.Interactions', 'Post.Views', 'Sponsor.Category', 'Analytic',
       'anger', 'sad', 'social', 'family', 'female', 'male', 'cogproc',
       'percept', 'see', 'hear', 'feel', 'health', 'drives', 'affiliation',
       'achieve', 'power', 'reward', 'risk', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'PageSinceCreated', 'PageCreatedYear',
       'PageCreatedMonth', 'PageCreatedDay', 'PageCreatedHour',
       'PostCreatedYear', 'PostCreatedMonth', 'PostCreatedday',
       'PostCreatedHour', 'Message_len', 'Message_sentiment_category',
       'Message_subjective_category', 'link_title_len', 'link_desp_len',
       'page_desc_len', 'page_desc_sentiment_category',
       'page_desc_subjective_category', 'Link', 'Live Video',
       'Live Video Complete', 'Live Video Scheduled', 'Native Video', 'Photo',
       'Status', 'Video', 'YouTube', 'crosspost', 'owned', 'share'],
      dtype='objec

In [29]:
from sklearn.model_selection import train_test_split

# Extrinsic-only design matrix: remove the retained LIWC indicator columns from df_ready.
df_ext = df_ready.drop(columns=[
    'Analytic', 'anger', 'sad', 'social', 'family', 'female', 'male', 'cogproc', 'percept',
    'see', 'hear', 'feel', 'health', 'drives', 'affiliation', 'achieve', 'power', 'reward',
    'risk', 'work', 'leisure', 'home', 'money', 'relig', 'death',
])

# Dummify: one-hot encode the categorical predictors, dropping the first level of each.
X = pd.get_dummies(
    df_ext,
    columns=['Type', 'Sponsor.Category', 'Message_sentiment_category',
             'Message_subjective_category', 'page_desc_sentiment_category',
             'page_desc_subjective_category', 'Page.Category'],
    drop_first=True,
)

# Split the regression target off; after this X holds predictors only.
y = X.pop('Total.Interactions')

# 70/30 train/test partition with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

In [30]:
import time

# Time the fit with a monotonic clock, so the reported interval cannot be distorted by
# system clock adjustments that happen while training runs.
start = time.perf_counter()

# Takes some time to run

rf_model = forest(x_train,y_train)
# Alternative models, left switched off:
#lrf_model = lightforest(x_train,y_train)
#xgb_model = XGB(x_train,y_train)
#lgbm_model = LGBM(x_train,y_train)

stop = time.perf_counter()
print(f"Training time: {stop - start} seconds")


Training time: 685.4238214492798 seconds


In [31]:
# Make predictions on the held-out split and print metrics
predictions = rf_model.predict(x_test)
# Alternative models, left switched off:
#predictions = lgbm_model.predict(x_test)
#predictions = lrf_model.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
# R^2 is taken from the same `predictions` as the two metrics above. This avoids a second
# prediction pass and keeps all three numbers about one model if `predictions` is switched.
print('R^2: ', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 461.77790439014944
Root Mean Squared Error: 8439.954928061265
R^2:  0.221924484378051


In [34]:
# Pair every predictor column with the fitted forest's importance score, most important first.
coeff_list = (
    pd.DataFrame(
        list(zip(X.columns, rf_model.feature_importances_)),
        columns=['predictor', 'feature importance'],
    )
    .sort_values(by='feature importance', ascending=False)
)

# Persist the full ranking, then show the top ten.
coeff_list.to_csv("Extrinsic_FeatureImportance.csv", index=False)
coeff_list.head(10)

Unnamed: 0,predictor,feature importance
2,Post.Views,0.354668
12,Message_len,0.09144
11,PostCreatedHour,0.082216
13,link_title_len,0.075985
1,Followers.at.Posting,0.070699
0,Likes.at.Posting,0.06617
10,PostCreatedday,0.041404
8,PostCreatedYear,0.026594
9,PostCreatedMonth,0.026245
14,link_desp_len,0.02227


## Model 3 - Intrinsic Factors only

In [41]:
# Model 3 input: discard the columns the author flagged as correlated/unimportant,
# followed by the LIWC summary variables.
df_liwc = (
    df_a
    .drop(columns=['User.Name', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
                   'Angry', 'Care',
                   'Video.Share.Status', 'Is.Video.Owner.', 'Total.Views.For.All.Crossposts',
                   'Sponsor.Name',
                   'Message_sentiment', 'Message_subjective',
                   '-', 'No', 'Yes', 'is_linked', 'image_text_ind', 'sponsor_cate_effect',
                   'page_cate_effect',
                   'page_desc_sentiment', 'page_desc_subjective'])
    # Summary variables
    .drop(columns=['WC', 'WPS', 'Dic', 'Sixltr', 'Clout', 'Authentic', 'Tone'])
)

# Set the page identifier aside as well.
df_ready = df_liwc.drop(columns=['Page.Name'])

df_ready.columns

Index(['Page.Category', 'Likes.at.Posting', 'Followers.at.Posting', 'Type',
       'Total.Interactions', 'Post.Views', 'Total.Views', 'Sponsor.Category',
       'Analytic', 'anger', 'sad', 'social', 'family', 'female', 'male',
       'cogproc', 'percept', 'see', 'hear', 'feel', 'health', 'drives',
       'affiliation', 'achieve', 'power', 'reward', 'risk', 'work', 'leisure',
       'home', 'money', 'relig', 'death', 'PageSinceCreated',
       'PageCreatedYear', 'PageCreatedMonth', 'PageCreatedDay',
       'PageCreatedHour', 'PostCreatedYear', 'PostCreatedMonth',
       'PostCreatedday', 'PostCreatedHour', 'Message_len',
       'Message_sentiment_category', 'Message_subjective_category',
       'link_title_len', 'link_desp_len', 'page_desc_len',
       'page_desc_sentiment_category', 'page_desc_subjective_category', 'Link',
       'Live Video', 'Live Video Complete', 'Live Video Scheduled',
       'Native Video', 'Photo', 'Status', 'Video', 'YouTube', 'crosspost',
       'owned', 'share

In [42]:
# Intrinsic-only frame: remove the page/post metadata, view counts, length features and
# post-type indicator columns from df_ready. Each label is listed exactly once; the
# earlier list repeated nine of them, which hid what was actually being removed.
df_int = df_ready.drop(columns=[
    # Page and sponsor attributes, audience size and view counts
    'Page.Category', 'Likes.at.Posting', 'Followers.at.Posting', 'Post.Views', 'Total.Views',
    'Sponsor.Category',
    # Page creation timing
    'PageSinceCreated', 'PageCreatedYear', 'PageCreatedMonth', 'PageCreatedDay',
    'PageCreatedHour',
    # Post creation timing
    'PostCreatedYear', 'PostCreatedMonth', 'PostCreatedday', 'PostCreatedHour',
    # Length features
    'link_title_len', 'link_desp_len', 'page_desc_len', 'Message_len',
    # Remaining indicator columns
    'Link', 'Live Video', 'Live Video Complete', 'Live Video Scheduled', 'Native Video',
    'Photo', 'Status', 'Video', 'YouTube', 'crosspost', 'owned', 'share',
])
df_int.columns

Index(['Type', 'Total.Interactions', 'Analytic', 'anger', 'sad', 'social',
       'family', 'female', 'male', 'cogproc', 'percept', 'see', 'hear', 'feel',
       'health', 'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
       'work', 'leisure', 'home', 'money', 'relig', 'death',
       'Message_sentiment_category', 'Message_subjective_category',
       'page_desc_sentiment_category', 'page_desc_subjective_category'],
      dtype='object')

In [43]:
from sklearn.model_selection import train_test_split

# Dummify: one-hot encode the categorical predictors, dropping the first level of each.
X = pd.get_dummies(
    df_int,
    columns=['Type', 'Message_sentiment_category', 'Message_subjective_category',
             'page_desc_sentiment_category', 'page_desc_subjective_category'],
    drop_first=True,
)

# Split the regression target off; after this X holds predictors only.
y = X.pop('Total.Interactions')

# 70/30 train/test partition with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=13,
)

In [44]:
import time

# Time the fit with a monotonic clock, so the reported interval cannot be distorted by
# system clock adjustments that happen while training runs.
start = time.perf_counter()

# Takes some time to run

rf_model = forest(x_train,y_train)
# Alternative models, left switched off:
#lrf_model = lightforest(x_train,y_train)
#xgb_model = XGB(x_train,y_train)
#lgbm_model = LGBM(x_train,y_train)

stop = time.perf_counter()
print(f"Training time: {stop - start} seconds")


Training time: 261.1273441314697 seconds


In [45]:
# Make predictions on the held-out split and print metrics
predictions = rf_model.predict(x_test)
# Alternative models, left switched off:
#predictions = lgbm_model.predict(x_test)
#predictions = lrf_model.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
# R^2 is taken from the same `predictions` as the two metrics above. This avoids a second
# prediction pass and keeps all three numbers about one model if `predictions` is switched.
print('R^2: ', metrics.r2_score(y_test, predictions))

Mean Absolute Error: 638.2403945758734
Root Mean Squared Error: 9582.163186468362
R^2:  -0.0029253758968645904


In [46]:
# Pair every predictor column with the fitted forest's importance score, most important first.
coeff_list = (
    pd.DataFrame(
        list(zip(X.columns, rf_model.feature_importances_)),
        columns=['predictor', 'feature importance'],
    )
    .sort_values(by='feature importance', ascending=False)
)

# Persist the full ranking, then show the top ten.
coeff_list.to_csv("Intrinsic_FeatureImportance.csv", index=False)
coeff_list.head(10)

Unnamed: 0,predictor,feature importance
0,Analytic,0.093945
3,social,0.075009
13,drives,0.070431
8,percept,0.0694
7,cogproc,0.062281
20,leisure,0.054974
16,power,0.053244
6,male,0.050973
9,see,0.039167
10,hear,0.033032
