In [1]:
import pandas as pd
import numpy as np
import re
import contractions
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [2]:
import random

np.random.seed(42)
random.seed(42)

In [3]:
birth_year_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\birth_year.csv")
ext_int_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\extrovert_introvert.csv")
feeling_thinking_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\feeling_thinking.csv")
gender_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\gender.csv")
judging_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\judging_perceiving.csv")
nationality_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\nationality.csv")
political_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\political_leaning.csv")
sensing_int_df = pd.read_csv(r"C:\Users\20223661\Downloads\data\sensing_intuitive.csv")

In [9]:
small_birth_year_df = birth_year_df.sample(n=1000)
small_regex_birth_year_df = birth_year_df.sample(n=1000)
small_normal_birth_year_df = birth_year_df.sample(n=1000)

small_birth_year_df = small_birth_year_df.reset_index(drop=True)
small_regex_birth_year_df = small_regex_birth_year_df.reset_index(drop=True)
small_normal_birth_year_df = small_normal_birth_year_df.reset_index(drop=True)

In [7]:
small_regex_birth_year_df['post'] = small_regex_birth_year_df['post'].apply(lambda x: re.compile(f"({re.escape(str(x))})"))

In [10]:
# Step 1: Expand contractions
small_normal_birth_year_df['post'] = small_normal_birth_year_df['post'].apply(contractions.fix)

# Step 2: Normalize Unicode to ASCII
small_normal_birth_year_df['post'] = small_normal_birth_year_df['post'].apply(
    lambda x: unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode('utf-8')
)

# Step 3: Remove extra spaces
small_normal_birth_year_df['post'] = small_normal_birth_year_df['post'].apply(
    lambda x: ' '.join(x.split())
)

# Step 4: Remove URLs
small_normal_birth_year_df['post'] = small_normal_birth_year_df['post'].apply(
    lambda x: re.sub(r"(https|http)?:\S*", "", x)
)

In [None]:
# nltk.download('punkt')       # For tokenization
# nltk.download('stopwords')   # For stopwords
# nltk.download('punkt_tab')   # For tokenization

In [11]:
stop_words = set(stopwords.words('english'))

# Remove stopwords from the 'post' column
small_normal_birth_year_df['post'] = small_normal_birth_year_df['post'].apply(
    lambda text: ' '.join(
        [word for word in word_tokenize(text) if word.lower() not in stop_words]
    )
)

In [None]:
from datetime import datetime
current_year = datetime.now().year

# Create a temporary DataFrame with unique authors
unique_authors = birth_year_df.loc[birth_year_df['auhtor_ID'].drop_duplicates().index]

# Calculate ages in the temporary DataFrame
unique_authors['age'] = current_year - unique_authors['birth_year']

# Plot the age distribution
plt.figure(figsize=(10, 6))
plt.hist(unique_authors['age'], bins=10, edgecolor='black', alpha=0.7)
plt.title('Age Distribution of Unique Authors')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)
plt.savefig(r"C:\Users\20223661\Downloads")
plt.show()

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, RegressorMixin

In [None]:
# Splitting the data into training and test sets
X = small_normal_birth_year_df['post']
y = small_normal_birth_year_df['birth_year']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a pipeline for TF-IDF + Ridge Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=55000)),  # Experiment with max_features
    ('regressor', Ridge())  # Ridge works well for regression; you can try others like RandomForestRegressor
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

In [36]:
X = small_birth_year_df['post']
y = small_birth_year_df['birth_year']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Ridge Regression Pipeline
# ridge_pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(max_features=10000)),
#     ('ridge', Ridge(alpha=1.0))
# ])

# Random Forest Pipeline
rf_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(max_features=15000)),
    ('rf', RandomForestRegressor(n_estimators=25, random_state=42))
])

# Cross-validate and compare
pipelines = {'Ridge': ridge_pipeline, 'Random Forest': rf_pipeline}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(f"{name} MAE: {-scores.mean():.2f}")

Ridge MAE: 7.61
Random Forest MAE: 8.11


In [None]:
# Wrapper to ensure scikit-learn compatibility
class SklearnCompatibleXGBRegressor(XGBRegressor, BaseEstimator, RegressorMixin):
    def fit(self, X, y, **kwargs):
        return super().fit(X, y, **kwargs)

# Data Preparation
X = small_birth_year_df['post']
y = small_birth_year_df['birth_year']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Pipeline
xgb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),  # Convert sparse to dense
    ('xgb', SklearnCompatibleXGBRegressor(n_estimators=50, random_state=42))
])

# Cross-validate the pipeline
scores = cross_val_score(xgb_pipeline, X_train, y_train, scoring='neg_mean_absolute_error', cv=5)
print(f"XGBoost Pipeline MAE: {-scores.mean():.2f}")

# Train and Test the Pipeline
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")


In [29]:
from transformers import BertTokenizer, BertModel
from transformers import AdamW
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tqdm as notebook_tqdm

In [30]:
# Dataset preparation
class TextDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "target": torch.tensor(target, dtype=torch.float),
        }

# BERT-based Regression Model
class BertRegressor(nn.Module):
    def __init__(self, bert_model_name):
        super(BertRegressor, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.fc(self.drop(pooled_output))

# Sklearn-compatible BERT wrapper
class SklearnBERTRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, bert_model_name="bert-base-uncased", max_length=128, batch_size=16, epochs=3, lr=5e-5):
        self.bert_model_name = bert_model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr

        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = BertRegressor(self.bert_model_name).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()

    def fit(self, X, y):
        dataset = TextDataset(X, y, self.tokenizer, self.max_length)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in dataloader:
                self.optimizer.zero_grad()

                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                targets = batch["target"].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                loss = self.criterion(outputs.squeeze(), targets)
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss:.4f}")

        return self

    def predict(self, X):
        dataset = TextDataset(X, [0] * len(X), self.tokenizer, self.max_length)  # Dummy targets
        dataloader = DataLoader(dataset, batch_size=self.batch_size)

        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)

                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                predictions.extend(outputs.squeeze().cpu().numpy())

        return predictions

# Data Preparation
X = small_birth_year_df["post"]
y = small_birth_year_df["birth_year"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and Evaluate
bert_regressor = SklearnBERTRegressor(epochs=3, batch_size=8)
bert_regressor.fit(X_train.tolist(), y_train.tolist())

y_pred = bert_regressor.predict(X_test.tolist())
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Epoch 1/3, Loss: 391062460.5000
Epoch 2/3, Loss: 387728304.5000
Epoch 3/3, Loss: 385984464.5000
Test MAE: 1962.22
