In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import os

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)

## Opening all the dataframes

In [64]:
df_original = pd.read_pickle('preprocessed_data/df_birthyear.pkl')
df_unique_author = df_original.loc[df_original['auhtor_ID'].drop_duplicates().index]
df_normalized = pd.read_pickle('preprocessed_data/df_normalized.pkl')

In [65]:
# selection the dataframe, change it here!
df = df_normalized

In [66]:
df.head(5)

Unnamed: 0,auhtor_ID,post,birth_year,age,age_range,clean_post,num_characters,num_special_symbols,num_emojis,contraction_count,num_contractions
34865,t2_3h3t0mb,record was 285 in May) **New States:** Illinoi...,1999,26,2,"['record', 'may', 'new', 'state', 'illinoi', '...",7985,424,1,41,41
38303,t2_ga9jkp22,Girly I am 20 with barely an a cup. I know it’...,2003,22,2,"['girli', 'bare', 'cup', 'know', 'difficult', ...",7864,213,11,30,30
36840,t2_8g3ecofl,"food you like in a single day, you're allowed ...",2001,24,2,"['food', 'like', 'singl', 'day', 'allow', 'tak...",2183,97,10,11,11
35322,t2_txn1p,so frustrating! Right. I am just frustrated th...,1999,26,2,"['frustrat', 'right', 'frustrat', 'thing', 'li...",3929,142,0,17,17
33673,t2_46463x14,"and smelly. shower? i’ll eat in the shower, ge...",1998,27,2,"['smelli', 'shower', 'eat', 'shower', 'get', '...",7819,235,1,35,35


## Ridge regression and Random Forest: Predicting with tokens

In [67]:
# Setup of models
X = df['clean_post']
y = df['age_range']

# Define the Ridge Regression Pipeline
def ridge_regression_pipeline():
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('ridge', Ridge(alpha=1.0))
    ])

# Define the Random Forest Pipeline
def random_forest_pipeline():
    return Pipeline([
        ('vectorizer', CountVectorizer(max_features=15000)),
        ('rf', RandomForestRegressor(n_estimators=10, random_state=42))
    ])

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [68]:
# MAE Ridge Regression
ridge_pipeline = ridge_regression_pipeline()
score = cross_val_score(ridge_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Ridge Regression MAE: {-score.mean():.2f} ± {score.std():.2f}")

Ridge Regression MAE: 0.86 ± 0.04


In [69]:
# Training Ridge Regression
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Ridge Regression"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    ridge_pipeline.fit(X_train, y_train)
    predictions = ridge_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Ridge Regression: 5it [00:01,  4.01it/s]


In [72]:
# Store results in a DataFrame
df['ridge_prediction'] = all_predictions
df[['post', 'clean_post', 'age_range', 'ridge_prediction']].head()

Unnamed: 0,post,clean_post,age_range,ridge_prediction
34865,record was 285 in May) **New States:** Illinoi...,"['record', 'may', 'new', 'state', 'illinoi', '...",2,3
38303,Girly I am 20 with barely an a cup. I know it’...,"['girli', 'bare', 'cup', 'know', 'difficult', ...",2,3
36840,"food you like in a single day, you're allowed ...","['food', 'like', 'singl', 'day', 'allow', 'tak...",2,3
35322,so frustrating! Right. I am just frustrated th...,"['frustrat', 'right', 'frustrat', 'thing', 'li...",2,3
33673,"and smelly. shower? i’ll eat in the shower, ge...","['smelli', 'shower', 'eat', 'shower', 'get', '...",2,3


In [53]:
# MAE Random Forest
forest_pipeline = random_forest_pipeline()
score = cross_val_score(forest_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Random Forest MAE: {-score.mean():.2f} ± {score.std():.2f}")

Random Forest MAE: 0.91 ± 0.04


In [54]:
# Training Random Forest
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Random Forest"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    forest_pipeline.fit(X_train, y_train)
    predictions = forest_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Random Forest: 5it [00:27,  5.45s/it]


In [73]:
# Store results in a DataFrame
df['forest_prediction'] = all_predictions
df[['post', 'clean_post', 'age_range', 'forest_prediction']].head()

Unnamed: 0,post,clean_post,age_range,forest_prediction
34865,record was 285 in May) **New States:** Illinoi...,"['record', 'may', 'new', 'state', 'illinoi', '...",2,3
38303,Girly I am 20 with barely an a cup. I know it’...,"['girli', 'bare', 'cup', 'know', 'difficult', ...",2,3
36840,"food you like in a single day, you're allowed ...","['food', 'like', 'singl', 'day', 'allow', 'tak...",2,3
35322,so frustrating! Right. I am just frustrated th...,"['frustrat', 'right', 'frustrat', 'thing', 'li...",2,3
33673,"and smelly. shower? i’ll eat in the shower, ge...","['smelli', 'shower', 'eat', 'shower', 'get', '...",2,3


## Ridge regression and Random Forest: Predicting with stylometry features

In [96]:
# Setup of models
X = df[['post', 'num_characters', 'num_special_symbols', 'num_emojis', 'num_contractions']]
y = df['age_range']

# Define the Ridge Regression Pipeline
def ridge_regression_pipeline():
    return Pipeline([
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('text', TfidfVectorizer(max_features=55000), 'post'),  # Process the 'post' text
                ('numerical', FunctionTransformer(), ['num_characters', 'num_special_symbols', 'num_emojis', 'num_contractions'])
            ]
        )),
        ('regressor', Ridge())
    ])

# Define the Random Forest Pipeline
def random_forest_pipeline():
    # Define the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', CountVectorizer(max_features=15000), 'post'),
            ('num', StandardScaler(), ['num_characters', 'num_special_symbols', 'num_emojis'])
        ]
    )
    
    # Create the pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('rf', RandomForestRegressor(n_estimators=25, random_state=42))
    ])
    
    return pipeline

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [103]:
# MAE Ridge Regression
ridge_pipeline = ridge_regression_pipeline()
score = cross_val_score(ridge_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Ridge Regression MAE: {-score.mean():.2f} ± {score.std():.2f}")

Ridge Regression MAE: 0.86 ± 0.04


In [104]:
# Training Ridge Regression
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Ridge Regression"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    ridge_pipeline.fit(X_train, y_train)
    predictions = ridge_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Ridge Regression: 5it [00:02,  1.93it/s]


In [105]:
# Store results in a DataFrame
df['ridge_prediction_+features'] = all_predictions
df[['post', 'clean_post', 'age_range', 'ridge_prediction_+features']].head()

Unnamed: 0,post,clean_post,age_range,ridge_prediction_+features
34865,record was 285 in May) **New States:** Illinoi...,"['record', 'may', 'new', 'state', 'illinoi', '...",2,3
38303,Girly I am 20 with barely an a cup. I know it’...,"['girli', 'bare', 'cup', 'know', 'difficult', ...",2,3
36840,"food you like in a single day, you're allowed ...","['food', 'like', 'singl', 'day', 'allow', 'tak...",2,4
35322,so frustrating! Right. I am just frustrated th...,"['frustrat', 'right', 'frustrat', 'thing', 'li...",2,3
33673,"and smelly. shower? i’ll eat in the shower, ge...","['smelli', 'shower', 'eat', 'shower', 'get', '...",2,3


In [99]:
# MAE Random Forest
forest_pipeline = random_forest_pipeline()
score = cross_val_score(forest_pipeline, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"Random Forest MAE: {-score.mean():.2f} ± {score.std():.2f}")

Random Forest MAE: 0.92 ± 0.04


In [100]:
# Training Random Forest
all_predictions = []

for train_index, test_index in tqdm(kf.split(X), desc="Cross-validating Random Forest"):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    forest_pipeline.fit(X_train, y_train)
    predictions = forest_pipeline.predict(X_test)
    all_predictions.extend(predictions)
    
all_predictions = [int(pred) for pred in all_predictions]

Cross-validating Random Forest: 5it [00:28,  5.66s/it]


In [102]:
# Store results in a DataFrame
df['forest_prediction_+features'] = all_predictions
df[['post', 'clean_post', 'age_range', 'forest_prediction_+features']].head()

Unnamed: 0,post,clean_post,age_range,forest_prediction_+features
34865,record was 285 in May) **New States:** Illinoi...,"['record', 'may', 'new', 'state', 'illinoi', '...",2,3
38303,Girly I am 20 with barely an a cup. I know it’...,"['girli', 'bare', 'cup', 'know', 'difficult', ...",2,3
36840,"food you like in a single day, you're allowed ...","['food', 'like', 'singl', 'day', 'allow', 'tak...",2,4
35322,so frustrating! Right. I am just frustrated th...,"['frustrat', 'right', 'frustrat', 'thing', 'li...",2,3
33673,"and smelly. shower? i’ll eat in the shower, ge...","['smelli', 'shower', 'eat', 'shower', 'get', '...",2,3


## Save dataframe

In [106]:
# get dataframe name
name_df = None
globals_copy = globals().copy()
for name, value in globals_copy.items():
    if value is df and name != 'df':
        name_df = name
        break
    
# to pickle
if not os.path.exists('predictions_data'):
    os.makedirs('predictions_data')
df.to_pickle(f'predictions_data/{name_df}.pkl')