In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import os

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

## Opening all the dataframes

In [43]:
df_original = pd.read_pickle('preprocessed_data/df_birthyear.pkl')
df_unique_author = df_original.loc[df_original['auhtor_ID'].drop_duplicates().index]
df_normalized = pd.read_pickle('preprocessed_data/df_normalized.pkl')
df_stylometry = np.nan

In [44]:
# selection the dataframe, change it here!
df = df_normalized

In [45]:
df.head(5)

Unnamed: 0,auhtor_ID,post,birth_year,age,age_range,clean_post
34865,t2_3h3t0mb,record was 285 in May) **New States:** Illinoi...,1999,26,2,"['record', 'may', 'new', 'state', 'illinoi', '..."
38303,t2_ga9jkp22,Girly I am 20 with barely an a cup. I know it’...,2003,22,2,"['girli', 'bare', 'cup', 'know', 'difficult', ..."
36840,t2_8g3ecofl,"food you like in a single day, you're allowed ...",2001,24,2,"['food', 'like', 'singl', 'day', 'allow', 'tak..."
35322,t2_txn1p,so frustrating! Right. I am just frustrated th...,1999,26,2,"['frustrat', 'right', 'frustrat', 'thing', 'li..."
33673,t2_46463x14,"and smelly. shower? i’ll eat in the shower, ge...",1998,27,2,"['smelli', 'shower', 'eat', 'shower', 'get', '..."


## Ridge regression with K-fold cross validation

In [46]:
# Setup of models
X = df['post']
y = df['age_range']

# Define the Ridge Regression Pipeline
def ridge_regression_pipeline():
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('ridge', Ridge(alpha=1.0))
    ])

# Define the Random Forest Pipeline
def random_forest_pipeline():
    return Pipeline([
        ('vectorizer', CountVectorizer(max_features=15000)),
        ('rf', RandomForestRegressor(n_estimators=25, random_state=42))
    ])

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [48]:
# MAE Ridge Regression
ridge_pipeline = ridge_regression_pipeline()
score = cross_val_score(ridge_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Ridge Regression MAE: {-score.mean():.2f} ± {score.std():.2f}")

Ridge Regression MAE: 0.87 ± 0.05


In [49]:
# Training Ridge Regression
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Ridge Regression"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ridge_pipeline.fit(X_train, y_train)
    predictions = ridge_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Ridge Regression: 5it [00:02,  2.26it/s]


In [51]:
# Store results in a DataFrame
df['ridge_prediction'] = all_predictions
df.head()

Unnamed: 0,auhtor_ID,post,birth_year,age,age_range,clean_post,ridge_prediction
34865,t2_3h3t0mb,record was 285 in May) **New States:** Illinoi...,1999,26,2,"['record', 'may', 'new', 'state', 'illinoi', '...",3
38303,t2_ga9jkp22,Girly I am 20 with barely an a cup. I know it’...,2003,22,2,"['girli', 'bare', 'cup', 'know', 'difficult', ...",3
36840,t2_8g3ecofl,"food you like in a single day, you're allowed ...",2001,24,2,"['food', 'like', 'singl', 'day', 'allow', 'tak...",4
35322,t2_txn1p,so frustrating! Right. I am just frustrated th...,1999,26,2,"['frustrat', 'right', 'frustrat', 'thing', 'li...",3
33673,t2_46463x14,"and smelly. shower? i’ll eat in the shower, ge...",1998,27,2,"['smelli', 'shower', 'eat', 'shower', 'get', '...",3


In [53]:
# MAE Random Forest
forest_pipeline = random_forest_pipeline()
score = cross_val_score(forest_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Random Forest MAE: {-score.mean():.2f} ± {score.std():.2f}")

Random Forest MAE: 0.91 ± 0.04


In [54]:
# Training Random Forest
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Random Forest"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    forest_pipeline.fit(X_train, y_train)
    predictions = forest_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Random Forest: 5it [00:27,  5.45s/it]


In [55]:
# Store results in a DataFrame
df['forest_prediction'] = all_predictions
df.head()

Unnamed: 0,auhtor_ID,post,birth_year,age,age_range,clean_post,ridge_prediction,forest_prediction
34865,t2_3h3t0mb,record was 285 in May) **New States:** Illinoi...,1999,26,2,"['record', 'may', 'new', 'state', 'illinoi', '...",3,3
38303,t2_ga9jkp22,Girly I am 20 with barely an a cup. I know it’...,2003,22,2,"['girli', 'bare', 'cup', 'know', 'difficult', ...",3,2
36840,t2_8g3ecofl,"food you like in a single day, you're allowed ...",2001,24,2,"['food', 'like', 'singl', 'day', 'allow', 'tak...",4,4
35322,t2_txn1p,so frustrating! Right. I am just frustrated th...,1999,26,2,"['frustrat', 'right', 'frustrat', 'thing', 'li...",3,3
33673,t2_46463x14,"and smelly. shower? i’ll eat in the shower, ge...",1998,27,2,"['smelli', 'shower', 'eat', 'shower', 'get', '...",3,3


## Save dataframe

In [62]:
# get dataframe name
name_df = None
globals_copy = globals().copy()
for name, value in globals_copy.items():
    if value is df and name != 'df':
        name_df = name
        break
    
# to pickle
if not os.path.exists('predictions_data'):
    os.makedirs('predictions_data')
df.to_pickle(f'predictions_data/{name_df}.pkl')