## Imports

In [1]:
from __future__ import annotations

import re
from typing import Optional, Any, Tuple, List

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

## Reading Datafiles

In [2]:
def read_csv(filename: str) -> pd.DataFrame:
    """Load a comma-separated, Latin-1 encoded file into a DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed contents of the file.
    """
    frame = pd.read_csv(filename, encoding='latin-1', sep=',')
    return frame

# Load the labelled Kaggle data (split into train/validation further down) and
# the test data that predictions are later exported for.
train_and_val_data, test_data = (
    read_csv(csv_path)
    for csv_path in (
        'data/ds4420_kaggle_train_data.csv',
        'data/ds4420_kaggle_test_data.csv',
    )
)

def split_label(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Separate the feature columns from the ``'Label'`` target column.

    Args:
        data: Frame that contains a ``'Label'`` column.

    Returns:
        A ``(features, labels)`` pair: ``features`` is ``data`` without the
        ``'Label'`` column (a new frame; ``data`` is not modified) and
        ``labels`` is the ``'Label'`` column as a Series.

    Raises:
        KeyError: If ``data`` has no ``'Label'`` column.
    """
    features = data.drop(columns='Label')
    labels = data['Label']
    return features, labels

# Separate the features from the 'Label' target for the combined
# train/validation data.
X_train_and_val, y_train_and_val = split_label(train_and_val_data)
# The test frame is used directly as the feature matrix; no label is split off
# it, and X_test is an alias of test_data rather than a copy.
X_test = test_data

## Splitting Into Training and Validation Sets

In [3]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val,
    y_train_and_val,
    test_size=0.2,
    random_state=42,
)

# Preview the first three training rows.
X_train.head(3)

Unnamed: 0,ID,Text,Selected_Text,Selected_Text_Sentiment,Time_of_Post,User_Age,Population,Land_Size,Population_Density
13294,13294,In a period of exams days are for study and ni...,no funny time..,negative,7,10,102334404,995450,103
17124,17124,So bored.... couldn`t go to a-kon...,bored..,negative,22,37,33469203,425400,79
12560,12560,its nice to have no assignments for the night,nice,positive,17,69,26378274,318000,83


In [4]:
def evaluate_and_save_results(pipeline: Pipeline, model_num: int) -> None:
    """Fit a pipeline, print validation metrics and export test predictions.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``X_test`` and ``test_data`` variables rather than taking them as
    arguments.

    Args:
        pipeline: Pipeline to fit; it is fitted in place on the training split.
        model_num: Number used in the output file name
            ``outputs/model{model_num}.csv``.

    Side effects:
        Prints a classification report for the validation split and writes a
        CSV with the columns ``ID`` and ``Label`` for the test data.
    """
    from pathlib import Path  # local import so this cell does not depend on the import cell

    pipeline.fit(X_train, y_train)

    # Only validation and test predictions are consumed. Predicting on the
    # training split as well would add cost without affecting any output.
    y_val_pred = pipeline.predict(X_val)
    y_test_pred = pipeline.predict(X_test)

    print(classification_report(y_val, y_val_pred))

    test_data_exportable = test_data[['ID']].copy()
    test_data_exportable['Label'] = y_test_pred

    # Create the output directory on first use so a fresh checkout does not
    # fail when the CSV is written.
    output_path = Path(f'outputs/model{model_num}.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    test_data_exportable.to_csv(output_path, index=False)

"""
def clean_text(text: str) -> List[str]:
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text
"""

"\ndef clean_text(text: str) -> List[str]:\n        text = text.lower()\n        text = re.sub(r'[^\\w\\s]', '', text)\n        return text\n"

In [5]:
# Per-model switches: each model cell below runs only if its entry is True.
# Replace an entry (e.g. models_to_run['model3'] = False) to skip that cell.
models_to_run = {f'model{number}': True for number in range(1, 11)}

In [6]:
if models_to_run['model1']:
    # Model 1: TF-IDF features built from the 'Text' column only, classified
    # with logistic regression.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(), 'Text'),
    ])

    # The final step is named 'regressor' although the estimator is a classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LogisticRegression(random_state=1)),
    ])

    evaluate_and_save_results(pipeline, 1)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3261
           1       0.85      0.60      0.70      1476

    accuracy                           0.84      4737
   macro avg       0.84      0.77      0.80      4737
weighted avg       0.84      0.84      0.83      4737



In [7]:
if models_to_run['model2']:
    # Model 2: TF-IDF text features (English stop words removed) combined with
    # standardised numeric columns, classified by a 200-tree random forest.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, 2)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3261
           1       0.77      0.70      0.73      1476

    accuracy                           0.84      4737
   macro avg       0.82      0.80      0.81      4737
weighted avg       0.84      0.84      0.84      4737



In [8]:
if models_to_run['model3']:
    # Model 3: same setup as a random forest on text + numeric features, but
    # the text is encoded as raw token counts instead of TF-IDF weights.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, 3)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3261
           1       0.77      0.72      0.74      1476

    accuracy                           0.84      4737
   macro avg       0.82      0.81      0.82      4737
weighted avg       0.84      0.84      0.84      4737



In [9]:
if models_to_run['model4']:
    # Model 4: token counts plus min-max scaled numeric columns, classified
    # with multinomial naive Bayes.
    # NOTE(review): MinMaxScaler is presumably used here (instead of
    # StandardScaler) to keep the numeric inputs non-negative for
    # MultinomialNB -- confirm.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', MultinomialNB()),
    ])

    evaluate_and_save_results(pipeline, 4)

              precision    recall  f1-score   support

           0       0.82      0.93      0.87      3261
           1       0.77      0.54      0.64      1476

    accuracy                           0.81      4737
   macro avg       0.80      0.74      0.75      4737
weighted avg       0.80      0.81      0.80      4737



In [10]:
if models_to_run['model5']:
    # Model 5: token counts plus standardised numeric columns, classified with
    # a polynomial-kernel support vector machine.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVC(kernel='poly')),
    ])

    evaluate_and_save_results(pipeline, 5)

              precision    recall  f1-score   support

           0       0.75      0.97      0.85      3261
           1       0.82      0.30      0.43      1476

    accuracy                           0.76      4737
   macro avg       0.78      0.63      0.64      4737
weighted avg       0.77      0.76      0.72      4737



In [11]:
if models_to_run['model6']:
    class YellingFeatureExtractor(BaseEstimator, TransformerMixin):
        """Stateless transformer that flags texts containing an upper-case word.

        The transformer has no hyper-parameters, so no ``__init__`` is defined
        and the default constructor is used.
        """

        @staticmethod
        def _contains_yelling(text: str) -> int:
            """Return 1 if any space-separated token of ``text`` is upper-case, else 0.

            NOTE(review): ``str.isupper`` is also true for one-letter tokens
            such as "I", so those count as yelling too -- confirm this is
            intended.
            """
            return int(any(token.isupper() for token in text.split(' ')))

        def fit(self, X: pd.Series, y: Optional[Any] = None) -> YellingFeatureExtractor:
            """Learn nothing and return ``self``."""
            return self

        def transform(self, X: pd.Series) -> np.ndarray:
            """Return a single-column 2-D array of 0/1 yelling flags, one row per text."""
            yelling_feature = X.apply(self._contains_yelling)
            return yelling_feature.values.reshape(-1, 1)

    # Model 6: token counts, standardised numeric columns and the yelling flag,
    # classified by a 200-tree random forest.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
            ('yelling', YellingFeatureExtractor(), 'Text'),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
        ]
    )

    evaluate_and_save_results(pipeline, 6)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3261
           1       0.77      0.71      0.74      1476

    accuracy                           0.84      4737
   macro avg       0.82      0.81      0.81      4737
weighted avg       0.84      0.84      0.84      4737



In [12]:
if models_to_run['model7']:
    # The class is defined again in this cell so it still exists when the
    # model6 cell is switched off.
    class YellingFeatureExtractor(BaseEstimator, TransformerMixin):
        """Stateless transformer that flags texts containing an upper-case word.

        The transformer has no hyper-parameters, so no ``__init__`` is defined
        and the default constructor is used.
        """

        @staticmethod
        def _contains_yelling(text: str) -> int:
            """Return 1 if any space-separated token of ``text`` is upper-case, else 0.

            NOTE(review): ``str.isupper`` is also true for one-letter tokens
            such as "I", so those count as yelling too -- confirm this is
            intended.
            """
            return int(any(token.isupper() for token in text.split(' ')))

        def fit(self, X: pd.Series, y: Optional[Any] = None) -> YellingFeatureExtractor:
            """Learn nothing and return ``self``."""
            return self

        def transform(self, X: pd.Series) -> np.ndarray:
            """Return a single-column 2-D array of 0/1 yelling flags, one row per text."""
            yelling_feature = X.apply(self._contains_yelling)
            return yelling_feature.values.reshape(-1, 1)

    # Model 7: like the yelling-flag random forest, but the numeric block uses
    # 'Population' and 'Land_Size' in place of 'Population_Density'.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population', 'Land_Size']),
            ('yelling', YellingFeatureExtractor(), 'Text'),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
        ]
    )

    # Save under this cell's own model number: the cell is guarded by
    # models_to_run['model7'], so its predictions belong in outputs/model7.csv.
    evaluate_and_save_results(pipeline, 7)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3261
           1       0.77      0.71      0.74      1476

    accuracy                           0.85      4737
   macro avg       0.82      0.81      0.82      4737
weighted avg       0.84      0.85      0.84      4737

