In [0]:
import time
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
random_state = 7

def threshold(x):
    return 1 if x > 0.5 else 0
tot = 7700

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df = pd.read_csv('data/prepped/prepped.csv')
df['postText'] = df['postText'].transform(lambda x: np.str_(x))
df['targetKeywords'] = df['targetKeywords'].transform(lambda x:np.str_(x))

In [0]:
constructed_features = ['headline_length', 'words_avglength', 'stop_words_count', 'count_slang', 'count_punctuation', 'cardinal_beginings', 'phrase_clickbait', 'count_determiners', 'count_possessives', 'count_adverb', 'count_propernoun']
text_features = ['postText', 'targetKeywords']
all_used_features = text_features + constructed_features
# X = df[all_used_features]
# y = df['truthMean']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)


In [0]:
tfidf_vec = TfidfVectorizer(ngram_range=(2,2))
X = tfidf_vec.fit_transform(df['postText'].values)
y =  df['truthMean']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [0]:
postText_svr_start = time.time()
postText_svr = LinearSVR(random_state=0, max_iter=100)
postText_svr.fit(X_train[0:tot].toarray(), y_train[0:tot])
postText_svr.fit(X_train[tot:].toarray(), y_train[tot:])
postText_svr_preds = postText_svr.predict(X_test.toarray())
print(classification_report(np.round(y_test), np.round(postText_svr_preds)))
print(f'MSE: {mean_squared_error(y_test, postText_svr_preds)}')
print(f'Accuracy: {accuracy_score(np.round(y_test), np.round(postText_svr_preds))}')
print(f'linear svr tfidf total time: {time.time() - postText_svr_start}s.')

              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      4981
         1.0       0.76      0.14      0.24      1619

    accuracy                           0.78      6600
   macro avg       0.77      0.56      0.56      6600
weighted avg       0.77      0.78      0.72      6600

MSE: 0.05138836346381614
Accuracy: 0.7786363636363637
linear svr tfidf total time: 15.455015420913696s.


In [0]:
postText_coefs = postText_svr.coef_
postText_coefs_indices = np.argpartition(postText_coefs, -10)[-10:]
top_feature_list = []
for ind in postText_coefs_indices:
    for key, val in tfidf_vec.vocabulary_.items():
            if val == ind:
                top_feature_list.append(key)
print(top_feature_list)

['conceptwhat conceptwhat', 'the most', 'you have', 'the best', 'merica merica', 'you should', 'how to', 'people who', 'good qgood', 'are the']


In [0]:
postText_logReg_start = time.time()
postText_logReg = LogisticRegression(max_iter=100)
postText_logReg.fit(X_train[0:tot].toarray(), np.vectorize(threshold)(y_train[0:tot]))
postText_logReg.fit(X_train[tot:].toarray(), np.vectorize(threshold)(y_train[tot:]))
postText_logReg_preds = postText_logReg.predict_proba(X_test.toarray())

In [0]:
print(classification_report(np.vectorize(threshold)(y_test), np.vectorize(threshold)(postText_logReg_preds[:, 1])))
print(f'MSE: {mean_squared_error(y_test, postText_logReg_preds[:, 1])}')
print(f'Accuracy: {accuracy_score( np.vectorize(threshold)(y_test), np.vectorize(threshold)(postText_logReg_preds[:, 1]))}')
print(f'log reg postText tfidf total time: {time.time() - postText_logReg_start}s.')

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      4981
           1       0.78      0.02      0.04      1619

    accuracy                           0.76      6600
   macro avg       0.77      0.51      0.45      6600
weighted avg       0.76      0.76      0.66      6600

MSE: 0.06286607335062663
Accuracy: 0.7584848484848485
linear svr tfidf total time: 21.11372685432434s.


In [0]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2))
X = tfidf_vectorizer.fit_transform(df['targetKeywords'].values)
y =  df['truthMean']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [0]:
keywords_svr_start = time.time()
keywords_svr = LinearSVR(random_state=0, max_iter=100)
keywords_svr.fit(X_train[0:tot].toarray(), y_train[0:tot])
keywords_svr.fit(X_train[tot:].toarray(), y_train[tot:])
keywords_svr_preds = keywords_svr.predict(X_test.toarray())
print(classification_report(np.round(y_test), np.round(keywords_svr_preds)))
print(f'MSE: {mean_squared_error(y_test, keywords_svr_preds)}')
print(f'Accuracy: {accuracy_score(np.round(y_test), np.round(keywords_svr_preds))}')
print(f'linear svr keywords total time: {time.time() - keywords_svr_start}s.')


              precision    recall  f1-score   support

         0.0       0.77      0.99      0.86      4981
         1.0       0.68      0.08      0.15      1619

    accuracy                           0.77      6600
   macro avg       0.73      0.53      0.51      6600
weighted avg       0.75      0.77      0.69      6600

MSE: 0.0599915237940072
Accuracy: 0.7654545454545455
linear svr keywords total time: 10.291276216506958s.


In [0]:
keywords_coefs = keywords_svr.coef_
keywords_coefs_indices = np.argpartition(keywords_coefs, -10)[-10:]
top_feature_list = []
for ind in keywords_coefs_indices:
    for key, val in tfidf_vec.vocabulary_.items():
            if val == ind:
                top_feature_list.append(key)
print(top_feature_list)

['have giant', 'as science', 'florida child', 'app hing', 'app is', 'grandfathtim roth', 'as russia', 'as san', 'app that', 'as secret']


In [0]:
postText_coefs = postText_svr.coef_
postText_coefs_indices = np.argpartition(postText_coefs, -10)[-10:]
top_feature_list = []
for ind in postText_coefs_indices:
    for key, val in tfidf_vec.vocabulary_.items():
            if val == ind:
                top_feature_list.append(key)
print(top_feature_list)

In [0]:
keywords_logReg_start = time.time()
keywords_logReg = LogisticRegression(max_iter=100)
keywords_logReg.fit(X_train[0:tot].toarray(), np.vectorize(threshold)(y_train[0:tot]))
keywords_logReg.fit(X_train[tot:].toarray(), np.vectorize(threshold)(y_train[tot:]))
keywords_logReg_preds = keywords_logReg.predict_proba(X_test.toarray())
print(classification_report(np.vectorize(threshold)(y_test), np.vectorize(threshold)(keywords_logReg_preds[:, 1])))
print(f'MSE: {mean_squared_error(y_test, keywords_logReg_preds[:, 1])}')
print(f'Accuracy: {accuracy_score( np.vectorize(threshold)(y_test), np.vectorize(threshold)(keywords_logReg_preds[:, 1]))}')
print(f'log reg keywords total time: {time.time() - keywords_logReg_start}s.')



              precision    recall  f1-score   support

           0       0.76      1.00      0.86      4981
           1       0.85      0.02      0.04      1619

    accuracy                           0.76      6600
   macro avg       0.80      0.51      0.45      6600
weighted avg       0.78      0.76      0.66      6600

MSE: 0.06317275353788535
Accuracy: 0.7589393939393939
log reg keywords total time: 10.066035985946655s.


In [0]:
#linear SVC on constructed features
SVC_start_time = time.time()
X = df[constructed_features]
y = df['truthMean']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
SVC_on_constructed = LinearSVC(dual=False, fit_intercept=True, C=1, loss='squared_hinge', max_iter=2000, random_state=42)
SVC_on_constructed.fit(X_train, np.round(y_train))
SVC_construct_preds = SVC_on_constructed.predict(X_test)
print(classification_report(np.vectorize(threshold)(y_test), SVC_construct_preds))
print(f'MSE: {mean_squared_error(y_test, SVC_construct_preds)}')
print(f'Accuracy: {accuracy_score( np.vectorize(threshold)(y_test), SVC_construct_preds)}')
print(f'Constructed feature: {time.time() - SVC_start_time}s.')

NameError: ignored

              precision    recall  f1-score   support

           0       0.82      0.95      0.88      4981
           1       0.72      0.37      0.49      1619

    accuracy                           0.81      6600
   macro avg       0.77      0.66      0.69      6600
weighted avg       0.80      0.81      0.79      6600

Accuracy: 0.8104545454545454
log reg keywords total time: 807.1724503040314s.
