In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

import lightgbm as lgb

from functions import all_models as m
from functions import plot_importances

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Reddit comment data; the first CSV column becomes the row index.
reddit = pd.read_csv(
    'Data/reddit_data.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows to check the columns that were loaded.
reddit.head(n=5)

Unnamed: 0,author,body,created_utc,permalink,score,subreddit,total_awards_received,editable,Polarity,Subjectivity,Sentiment,date,target
0,execdysfunction,Maybe. We need to be aiming higher,2021-04-03 19:41:59,/r/politics/comments/mj839d/schumer_senate_wil...,1,politics,0,,0.25,0.5,Positive,4/3/2021,53.414394
1,yappledapple,I hadn't heard that one. I think the ones stil...,2021-04-03 19:41:59,/r/politics/comments/mjcrfb/schumer_says_senat...,1,politics,0,,-0.166667,0.5,Negative,4/3/2021,53.414394
2,Tots4trump,“The statue was presented to the British as a ...,2021-04-03 19:41:56,/r/politics/comments/mjczhl/confederate_symbol...,1,politics,0,,0.295,0.43,Positive,4/3/2021,53.414394
3,DroopyMcCool,Is this something that is in the DOI's purview...,2021-04-03 19:41:55,/r/politics/comments/mj6klw/secretary_deb_haal...,1,politics,0,,0.068182,0.227273,Positive,4/3/2021,53.414394
4,FlyingRock,New York legalizing is definitely why it's bei...,2021-04-03 19:41:55,/r/politics/comments/mj839d/schumer_senate_wil...,1,politics,0,,0.033939,0.517576,Positive,4/3/2021,53.414394


In [4]:
# Discard columns that are not used as model inputs.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
reddit = reddit.drop(
    columns=['author', 'permalink', 'created_utc', 'Sentiment',
             'date', 'editable', 'total_awards_received'],
    errors='ignore',
)

In [5]:
stop_words = stopwords.words('english')

# Built once rather than on every call: `process` is applied to each comment,
# so constructing these per call is repeated work with no benefit.
# `stop_words` itself stays a list because it is also passed to TfidfVectorizer;
# the frozenset is a snapshot used only for fast membership tests here.
_STOP_WORD_SET = frozenset(stop_words)
_LEMMATIZER = WordNetLemmatizer()
_TOKENIZER = RegexpTokenizer(r'[a-zA-Z0-9]+')


def process(text):
    """Normalize one comment body into a space-separated string of lemmas.

    Steps: lowercase, split into alphanumeric tokens, drop English stop
    words, lemmatize each remaining token, and join the tokens with spaces.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. Missing values (None / NaN) are treated as an
        empty comment; any other non-string scalar is converted with `str`.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # A missing body would otherwise fail on `.lower()`.
        text = '' if pd.isna(text) else str(text)

    tokens = _TOKENIZER.tokenize(text.lower())
    lemmas = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORD_SET
    ]
    return ' '.join(lemmas)

In [6]:
# Per-viewpoint subsets, selected by subreddit name:
# `left` holds rows from 'democrats', `right` holds rows from 'Conservative'.
left = reddit[reddit['subreddit'].eq('democrats')]
right = reddit[reddit['subreddit'].eq('Conservative')]

In [7]:
# Separate the regression target from the predictors (all subreddits).
y = reddit['target']
x = reddit.drop('target', axis=1)

In [8]:
# Hold out a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=16,
)

In [9]:
# Clean every comment body with `process`, separately for each split.
x_train_text = x_train['body'].map(process)
x_test_text = x_test['body'].map(process)

In [10]:
# TF-IDF over the cleaned text, capped at 1000 terms.
vectorizer = TfidfVectorizer(
    max_features=1000,
    token_pattern='[a-zA-Z0-9]+',
    stop_words=stop_words,
)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
x_train_text = vectorizer.fit_transform(x_train_text)
x_test_text = vectorizer.transform(x_test_text)

In [11]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    text_feature_names = list(vectorizer.get_feature_names_out())
else:
    text_feature_names = list(vectorizer.get_feature_names())

# Densify the sparse TF-IDF matrices into DataFrames with one column per term.
# `.toarray()` gives a plain ndarray; `.todense()` would give an np.matrix.
x_train_text_df = pd.DataFrame(x_train_text.toarray(), columns=text_feature_names)
x_test_text_df = pd.DataFrame(x_test_text.toarray(), columns=text_feature_names)

In [12]:
# Non-text features: remove the raw comment text and one-hot encode the rest.
# `drop` returns a new frame, so the split slices are not mutated in place.
x_train_meta = pd.get_dummies(x_train.drop(columns=['body']))

# Encode the test split against the TRAIN columns. Running get_dummies
# independently on each split yields different columns whenever a category
# appears in only one of them; missing dummies are filled with 0 and
# categories unseen in training are dropped.
x_test_meta = (
    pd.get_dummies(x_test.drop(columns=['body']))
    .reindex(columns=x_train_meta.columns, fill_value=0)
)

# The TF-IDF frames were built without an index; give them the row labels of
# their split so the column-wise concat lines up row for row.
x_train_text_df = x_train_text_df.set_index(x_train_meta.index)
x_test_text_df = x_test_text_df.set_index(x_test_meta.index)

# Final design matrices: metadata/dummy columns first, then the TF-IDF terms.
# (The former trailing `.reindex(...)` was a no-op: both operands already
# share the same index.)
# NOTE(review): a TF-IDF term equal to a metadata column name (e.g. 'score')
# would produce duplicate column labels — confirm this cannot happen.
x_train = pd.concat([x_train_meta, x_train_text_df], axis=1)
x_test = pd.concat([x_test_meta, x_test_text_df], axis=1)

In [13]:
# Candidate regressors and their display names (same order in both lists).
# The two scikit-learn tree models are seeded with the value used for the
# train/test split so repeated runs give the same scores; the other estimators
# are left at their library defaults.
objects = [
    LinearRegression(),
    # KNN is distance-based, so features are scaled first (no centering).
    Pipeline([('ss', StandardScaler(with_mean=False)), ('knn', KNeighborsRegressor())]),
    DecisionTreeRegressor(random_state=16),
    RandomForestRegressor(random_state=16),
    XGBRegressor(),
    CatBoostRegressor(verbose=False),
    lgb.LGBMRegressor(),
]
index = ['Linear Regression', 'Nearest Neighbors', 'Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'LightGBM']

In [None]:
# Run the shared helper (`functions.all_models`) over every estimator.
# NOTE(review): the helper is defined outside this notebook; later cells index
# its result as `models[i]` in the order of `objects` — confirm that contract.
models = m(
    x_train,
    x_test,
    y_train,
    y_test,
    objects=objects,
    index=index,
)

Linear Regression model fit...
Nearest Neighbors model fit...
Decision Tree model fit...


In [None]:
# RMSE and R-squared per model, in the same order as `index`.
# NOTE(review): these are hard-coded literals, not values computed from
# `models` — confirm they still match the current run.
rmse = [2.1599, 2.3602, 2.8219, 2.2079, 2.1532, 2.1479, 2.1483]
rsquared = [.0334, -.1542, -.65, -.0101, .0394, .0441, .0437]

metrics = pd.DataFrame({'RMSE': rmse, 'R-Squared': rsquared}, index=index)
display(metrics)

In [None]:
# Pair each importance with the column it belongs to. `x_train` holds the
# metadata/dummy columns FOLLOWED by the TF-IDF terms, so labelling with the
# vectorizer vocabulary alone shifts every name and truncates the list.
# NOTE(review): assumes `models[2]` is the Decision Tree fitted on `x_train` — confirm.
feature_names = list(x_train.columns)
importances = sorted(zip(models[2].feature_importances_, feature_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

# Top 20 features by importance.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('Decision Tree Feature Importances')
plt.show();

In [None]:
# Pair each importance with the column it belongs to. `x_train` holds the
# metadata/dummy columns FOLLOWED by the TF-IDF terms, so labelling with the
# vectorizer vocabulary alone shifts every name and truncates the list.
# NOTE(review): assumes `models[3]` is the Random Forest fitted on `x_train` — confirm.
feature_names = list(x_train.columns)
importances = sorted(zip(models[3].feature_importances_, feature_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

# Top 20 features by importance.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('Random Forest Feature Importances')
plt.show();

In [None]:
# Pair each importance with the column it belongs to. `x_train` holds the
# metadata/dummy columns FOLLOWED by the TF-IDF terms, so labelling with the
# vectorizer vocabulary alone shifts every name and truncates the list.
# NOTE(review): assumes `models[4]` is the XGBoost model fitted on `x_train` — confirm.
feature_names = list(x_train.columns)
importances = sorted(zip(models[4].feature_importances_, feature_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

# Top 20 features by importance.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('XGBoost Feature Importances')
plt.show();

In [None]:
# Pair each importance with the column it belongs to. `x_train` holds the
# metadata/dummy columns FOLLOWED by the TF-IDF terms, so labelling with the
# vectorizer vocabulary alone shifts every name and truncates the list.
# NOTE(review): assumes `models[5]` is the CatBoost model fitted on `x_train` — confirm.
feature_names = list(x_train.columns)
importances = sorted(zip(models[5].feature_importances_, feature_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

# Top 20 features by importance.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('Cat Boost Feature Importances')
plt.show();

In [None]:
# Pair each importance with the column it belongs to. `x_train` holds the
# metadata/dummy columns FOLLOWED by the TF-IDF terms, so labelling with the
# vectorizer vocabulary alone shifts every name and truncates the list.
# NOTE(review): assumes `models[6]` is the LightGBM model fitted on `x_train` — confirm.
feature_names = list(x_train.columns)
importances = sorted(zip(models[6].feature_importances_, feature_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

# Top 20 features by importance.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('LightGBM Feature Importances')
plt.show();

## PCA

In [None]:
# Reduce the full feature matrix to 20 principal components.
# The seed makes the projection repeatable when scikit-learn selects its
# randomized SVD solver; it matches the seed used for the train/test split.
# NOTE(review): features are not standardized before PCA — confirm intended.
pca = PCA(n_components=20, random_state=16)

# Components are learned from the training split only and reused for the test split.
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [None]:
# LightGBM regressor with default settings, trained on the PCA components.
pca_model = lgb.LGBMRegressor().fit(x_train_pca, y_train)
pca_model

In [None]:
# Score the PCA-based model on the held-out split.
prediction = pca_model.predict(x_test_pca)

# Take the square root explicitly: the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every release.
rmse = np.sqrt(mean_squared_error(y_test, prediction))
rsquared = r2_score(y_test, prediction)

print(f'R-Squared: {rsquared} \nRMSE: {rmse}')

In [None]:
# `pca_model` was trained on principal components, not on vocabulary terms, so
# its importances are labelled PC1..PCn. Zipping them with the vectorizer
# vocabulary would attach unrelated words to each component.
component_names = [f'PC{i + 1}' for i in range(len(pca_model.feature_importances_))]
importances = sorted(zip(pca_model.feature_importances_, component_names))[-20:]
plot = pd.DataFrame(importances, columns=['Importance', 'Feature'])

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=plot, y='Feature', x='Importance', ax=ax)
ax.set_title('LightGBM Feature Importances (PCA Components)')
plt.show();

In [None]:
# Run the shared helper again, this time on the PCA-reduced features.
# NOTE(review): the same estimator instances in `objects` are passed as in the
# earlier call; if the helper fits them in place, the earlier `models` result
# may now refer to refitted estimators — confirm against functions.all_models.
pca_models = m(
    x_train_pca,
    x_test_pca,
    y_train,
    y_test,
    objects=objects,
    index=index,
)

In [None]:
# Actual vs predicted target values for whatever `prediction` currently holds
# (when cells are run in order, the PCA-based LightGBM model).
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(x=y_test, y=prediction, ax=ax)
ax.set(
    xlabel='Actual Approval Rating',
    ylabel='Predicted Approval Rating',
    title='Actual vs Predicted Approval Ratings',
);

In [None]:
# K-Means as a feature transformer: `fit_transform` / `transform` map each row
# to its distances from the learned cluster centres.
# K-Means initialisation is random, so a seed (the same one used for the
# train/test split) makes these features repeatable between runs.
cluster = KMeans(random_state=16)

# Centres are learned from the training split only and reused for the test split.
x_train_cluster = cluster.fit_transform(x_train)
x_test_cluster = cluster.transform(x_test)

In [None]:
# Run the shared helper on the cluster-distance features.
# NOTE(review): the same estimator instances in `objects` are reused from the
# earlier calls; if the helper fits them in place, earlier results may now
# refer to refitted estimators — confirm against functions.all_models.
cluster_models = m(
    x_train_cluster,
    x_test_cluster,
    y_train,
    y_test,
    objects=objects,
    index=index,
)

# Model by Viewpoint - Left

In [None]:
# Separate target and predictors for the `left` subset.
# This overwrites the `x` / `y` used for the all-subreddit models.
y = left['target']
x = left.drop('target', axis=1)

In [None]:
# Hold out a test set from the `left` subset, using the same fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=16,
)

In [None]:
# Clean every comment body with `process`, separately for each split.
x_train_text = x_train['body'].map(process)
x_test_text = x_test['body'].map(process)

In [None]:
# Fresh TF-IDF vectorizer for this subset, capped at 1000 terms.
# It replaces the `vectorizer` fitted on the all-subreddit data.
vectorizer = TfidfVectorizer(
    max_features=1000,
    token_pattern='[a-zA-Z0-9]+',
    stop_words=stop_words,
)

# Vocabulary comes from the training split only; the test split reuses it.
x_train_text = vectorizer.fit_transform(x_train_text)
x_test_text = vectorizer.transform(x_test_text)

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    text_feature_names = list(vectorizer.get_feature_names_out())
else:
    text_feature_names = list(vectorizer.get_feature_names())

# Densify the sparse TF-IDF matrices into DataFrames with one column per term.
# `.toarray()` gives a plain ndarray; `.todense()` would give an np.matrix.
x_train_text_df = pd.DataFrame(x_train_text.toarray(), columns=text_feature_names)
x_test_text_df = pd.DataFrame(x_test_text.toarray(), columns=text_feature_names)

In [None]:
# Non-text features: remove the raw comment text and one-hot encode the rest.
# `drop` returns a new frame, so the split slices are not mutated in place.
x_train_meta = pd.get_dummies(x_train.drop(columns=['body']))

# Encode the test split against the TRAIN columns. Running get_dummies
# independently on each split yields different columns whenever a category
# appears in only one of them; missing dummies are filled with 0 and
# categories unseen in training are dropped.
x_test_meta = (
    pd.get_dummies(x_test.drop(columns=['body']))
    .reindex(columns=x_train_meta.columns, fill_value=0)
)

# The TF-IDF frames were built without an index; give them the row labels of
# their split so the column-wise concat lines up row for row.
x_train_text_df = x_train_text_df.set_index(x_train_meta.index)
x_test_text_df = x_test_text_df.set_index(x_test_meta.index)

# Final design matrices: metadata/dummy columns first, then the TF-IDF terms.
# (The former trailing `.reindex(...)` was a no-op: both operands already
# share the same index.)
x_train = pd.concat([x_train_meta, x_train_text_df], axis=1)
x_test = pd.concat([x_test_meta, x_test_text_df], axis=1)

In [None]:
# LightGBM regressor with default settings, trained on the `left` features.
lightgbm = lgb.LGBMRegressor().fit(x_train, y_train)
lightgbm

In [None]:
# Score the `left` model on its held-out split.
prediction = lightgbm.predict(x_test)

# Take the square root explicitly: the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every release.
rmse = np.sqrt(mean_squared_error(y_test, prediction))
rsquared = r2_score(y_test, prediction)

print(f'R-Squared: {rsquared} \nRMSE: {rmse}')

In [None]:
# Actual vs predicted target values for the current `prediction`
# (when cells are run in order, the `left` LightGBM model).
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(x=y_test, y=prediction, ax=ax)
ax.set(
    xlabel='Actual Approval Rating',
    ylabel='Predicted Approval Rating',
    title='Actual vs Predicted Approval Ratings',
);

# Model by Viewpoint - Right

In [None]:
# Separate target and predictors for the `right` subset.
# This overwrites the `x` / `y` used by the previous section.
y = right['target']
x = right.drop('target', axis=1)

In [None]:
# Hold out a test set from the `right` subset, using the same fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=16,
)

In [None]:
# Clean every comment body with `process`, separately for each split.
x_train_text = x_train['body'].map(process)
x_test_text = x_test['body'].map(process)

In [None]:
# Fresh TF-IDF vectorizer for this subset, capped at 1000 terms.
# It replaces the `vectorizer` fitted in the previous section.
vectorizer = TfidfVectorizer(
    max_features=1000,
    token_pattern='[a-zA-Z0-9]+',
    stop_words=stop_words,
)

# Vocabulary comes from the training split only; the test split reuses it.
x_train_text = vectorizer.fit_transform(x_train_text)
x_test_text = vectorizer.transform(x_test_text)

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    text_feature_names = list(vectorizer.get_feature_names_out())
else:
    text_feature_names = list(vectorizer.get_feature_names())

# Densify the sparse TF-IDF matrices into DataFrames with one column per term.
# `.toarray()` gives a plain ndarray; `.todense()` would give an np.matrix.
x_train_text_df = pd.DataFrame(x_train_text.toarray(), columns=text_feature_names)
x_test_text_df = pd.DataFrame(x_test_text.toarray(), columns=text_feature_names)

In [None]:
# Non-text features: remove the raw comment text and one-hot encode the rest.
# `drop` returns a new frame, so the split slices are not mutated in place.
x_train_meta = pd.get_dummies(x_train.drop(columns=['body']))

# Encode the test split against the TRAIN columns. Running get_dummies
# independently on each split yields different columns whenever a category
# appears in only one of them; missing dummies are filled with 0 and
# categories unseen in training are dropped.
x_test_meta = (
    pd.get_dummies(x_test.drop(columns=['body']))
    .reindex(columns=x_train_meta.columns, fill_value=0)
)

# The TF-IDF frames were built without an index; give them the row labels of
# their split so the column-wise concat lines up row for row.
x_train_text_df = x_train_text_df.set_index(x_train_meta.index)
x_test_text_df = x_test_text_df.set_index(x_test_meta.index)

# Final design matrices: metadata/dummy columns first, then the TF-IDF terms.
# (The former trailing `.reindex(...)` was a no-op: both operands already
# share the same index.)
x_train = pd.concat([x_train_meta, x_train_text_df], axis=1)
x_test = pd.concat([x_test_meta, x_test_text_df], axis=1)

In [None]:
# LightGBM regressor with default settings, trained on the `right` features.
lightgbm = lgb.LGBMRegressor().fit(x_train, y_train)
lightgbm

In [None]:
# Score the `right` model on its held-out split.
prediction = lightgbm.predict(x_test)

# Take the square root explicitly: the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every release.
rmse = np.sqrt(mean_squared_error(y_test, prediction))
rsquared = r2_score(y_test, prediction)

print(f'R-Squared: {rsquared} \nRMSE: {rmse}')

In [None]:
# Actual vs predicted target values for the current `prediction`
# (when cells are run in order, the `right` LightGBM model).
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(x=y_test, y=prediction, ax=ax)
ax.set(
    xlabel='Actual Approval Rating',
    ylabel='Predicted Approval Rating',
    title='Actual vs Predicted Approval Ratings',
);