## Imports

In [1]:
from __future__ import annotations

import os
import re
from typing import Optional, Any, Tuple, List

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

## Reading Datafiles

In [2]:
def read_csv(filename: str) -> pd.DataFrame:
    """
    Loads a comma-separated file into a DataFrame, decoding its bytes as Latin-1.
    """
    frame = pd.read_csv(filename, encoding='latin-1', sep=',')
    return frame

# Load the labelled data (later split into train/validation) and the test data to predict on.
train_and_val_data, test_data = (
    read_csv('data/ds4420_kaggle_train_data.csv'),
    read_csv('data/ds4420_kaggle_test_data.csv'),
)

def split_label(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separates the 'Label' column from the remaining columns.

    Returns a pair ``(features, labels)``: ``features`` is a new DataFrame holding every column
    except 'Label', and ``labels`` is the 'Label' column as a Series. The input frame is not
    modified. Raises ``KeyError`` if ``data`` has no 'Label' column.
    """
    features = data.drop('Label', axis=1)
    labels = data['Label']
    return features, labels

# Peel the labels off the labelled data, then hold out 20% of the rows for validation.
# The fixed random_state makes the split reproducible between runs.
X_train_and_val, y_train_and_val = split_label(train_and_val_data)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val, y_train_and_val, test_size=0.2, random_state=1,
)

# The test file is used as features directly; no label is split off it.
X_test = test_data

## Custom Feature Extractors

In [3]:
class CapsBinaryFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a binary feature where 1 denotes that at least one word in the text
    is completely capitalized, which is assumed to be yelling.

    The transformer has no hyperparameters and learns nothing in ``fit``, so it relies on the
    inherited no-argument constructor.
    """

    @staticmethod
    def _text_contains_caps_binary(text: str) -> int:
        """
        Returns 1 if any whitespace-separated word of ``text`` is fully upper-case, otherwise 0.
        """
        # split() with no argument breaks on any run of whitespace, so a capitalized word that is
        # followed by a newline or tab is still examined on its own instead of being glued to its
        # lower-case neighbour (which would make isupper() return False).
        return int(any(word.isupper() for word in text.split()))

    def fit(self, X: pd.Series, y: Optional[Any] = None) -> CapsBinaryFeatureExtractor:
        """
        Does nothing; present so the transformer can be used inside a pipeline. Returns ``self``.
        """
        return self

    def transform(self, X: pd.Series) -> np.ndarray:
        """
        Maps each text in ``X`` to its 0/1 flag and returns the flags as a single-column 2-D array
        with one row per entry of ``X``.
        """
        yelling_feature = X.apply(self._text_contains_caps_binary)
        return yelling_feature.values.reshape(-1, 1)

class CapsCountFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a numeric feature count of the number of words in the text that are
    completely capitalized, which is assumed to be yelling.

    The transformer has no hyperparameters and learns nothing in ``fit``, so it relies on the
    inherited no-argument constructor.
    """

    @staticmethod
    def _text_caps_count(text: str) -> int:
        """
        Returns how many whitespace-separated words of ``text`` are fully upper-case.
        """
        # split() with no argument breaks on any run of whitespace, so capitalized words separated
        # by newlines or tabs are counted individually rather than merged with their neighbours.
        return sum(1 for word in text.split() if word.isupper())

    def fit(self, X: pd.Series, y: Optional[Any] = None) -> CapsCountFeatureExtractor:
        """
        Does nothing; present so the transformer can be used inside a pipeline. Returns ``self``.
        """
        return self

    def transform(self, X: pd.Series) -> np.ndarray:
        """
        Maps each text in ``X`` to its capitalized-word count and returns the counts as a
        single-column 2-D array with one row per entry of ``X``.
        """
        yelling_feature = X.apply(self._text_caps_count)
        return yelling_feature.values.reshape(-1, 1)

class ExclamationBinaryFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a binary feature where 1 indicates the text contains at least
    one '!'.

    The transformer has no hyperparameters and learns nothing in ``fit``, so it relies on the
    inherited no-argument constructor.
    """

    def fit(self, X: pd.Series, y: Optional[Any] = None) -> ExclamationBinaryFeatureExtractor:
        """
        Does nothing; present so the transformer can be used inside a pipeline. Returns ``self``.
        """
        return self

    def transform(self, X: pd.Series) -> np.ndarray:
        """
        Maps each text in ``X`` to 1 if it contains '!' and 0 otherwise, returning the flags as a
        single-column 2-D array with one row per entry of ``X``.
        """
        exclamation_feature = X.apply(lambda text: int('!' in text))
        return exclamation_feature.values.reshape(-1, 1)

class ExclamationCountFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a numeric feature count of the number of exclamation marks, which is
    assumed to be yelling.

    The transformer has no hyperparameters and learns nothing in ``fit``, so it relies on the
    inherited no-argument constructor.
    """

    def fit(self, X: pd.Series, y: Optional[Any] = None) -> ExclamationCountFeatureExtractor:
        """
        Does nothing; present so the transformer can be used inside a pipeline. Returns ``self``.
        """
        return self

    def transform(self, X: pd.Series) -> np.ndarray:
        """
        Maps each text in ``X`` to the number of '!' characters it contains and returns the counts
        as a single-column 2-D array with one row per entry of ``X``.
        """
        exclamation_feature = X.apply(lambda text: text.count('!'))
        return exclamation_feature.values.reshape(-1, 1)

class SwearingFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a binary feature where 1 denotes swearing occurs, which is usually
    seen in text as a series of 2 or more asterisks, e.g. ****.

    The transformer has no hyperparameters and learns nothing in ``fit``, so it relies on the
    inherited no-argument constructor.
    """

    def fit(self, X: pd.Series, y: Optional[Any] = None) -> SwearingFeatureExtractor:
        """
        Does nothing; present so the transformer can be used inside a pipeline. Returns ``self``.
        """
        return self

    def transform(self, X: pd.Series) -> np.ndarray:
        """
        Maps each text in ``X`` to 1 if it contains two consecutive asterisks and 0 otherwise,
        returning the flags as a single-column 2-D array with one row per entry of ``X``.
        """
        # Any run of two or more asterisks necessarily contains '**', so a substring test suffices.
        swearing_feature = X.apply(lambda text: int('**' in text))
        return swearing_feature.values.reshape(-1, 1)

In [4]:
def clean_text(text: str) -> str:
    """
    Normalizes text by lower-casing it and stripping punctuation.

    Every character that is neither a word character (letter, digit, or underscore) nor whitespace
    is removed; whitespace is left exactly as it was. Returns the cleaned string.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

## Reusable Functionality to Evaluate a Model and Save Results

In [5]:
def evaluate_and_save_results(pipeline: Pipeline, model_num: int) -> None:
    """
    Fits a pipeline on the training split, prints validation metrics, and exports test predictions.

    Uses the module-level X_train, y_train, X_val, y_val, X_test, and test_data. The pipeline is
    fitted in place. Predictions for the test data are written, together with the 'ID' column, to
    'outputs/model<model_num>.csv'.
    """
    pipeline.fit(X_train, y_train)

    y_val_pred = pipeline.predict(X_val)
    y_test_pred = pipeline.predict(X_test)

    print(classification_report(y_val, y_val_pred))

    # Attach the predictions to a copy so the shared test_data frame never gains a 'Label' column.
    test_data_exportable = test_data.copy()
    test_data_exportable['Label'] = y_test_pred
    test_data_exportable = test_data_exportable[['ID', 'Label']]

    # Make sure the target folder exists so a finished training run is not lost to a missing
    # directory at write time.
    os.makedirs('outputs', exist_ok=True)
    test_data_exportable.to_csv(f'outputs/model{model_num}.csv', index=False)

## A "Switchboard" to Enable and Disable Running Each Model

In [6]:
# Switchboard: each model cell runs only when its flag here is True.
# NOTE(review): no cell for 'model10' is visible in this part of the notebook — confirm it exists.
models_to_run = dict(
    model1=False,
    model2=False,
    model3=False,
    model4=False,
    model5=False,
    model6=False,
    model7=False,
    model8=True,
    model9=True,
    model10=True,
)

## Model 1

In [7]:
if models_to_run['model1']:
    # Model 1: TF-IDF features from the 'Text' column fed into logistic regression.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(), 'Text'),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LogisticRegression(random_state=1)),
    ])
    evaluate_and_save_results(pipeline, model_num=1)

## Model 2

In [8]:
if models_to_run['model2']:
    # Model 2: TF-IDF text features (English stop words removed) plus standardized numeric
    # columns, classified by a 200-tree random forest.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])
    evaluate_and_save_results(pipeline, model_num=2)

## Model 3

In [9]:
if models_to_run['model3']:
    # Model 3: raw token counts (English stop words removed) plus standardized numeric columns,
    # classified by a 200-tree random forest.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])
    evaluate_and_save_results(pipeline, model_num=3)

## Model 4

In [10]:
if models_to_run['model4']:
    # Model 4: token counts plus min-max scaled numeric columns, classified by multinomial
    # naive Bayes.
    # NOTE(review): MinMaxScaler (rather than StandardScaler) is presumably used to keep the
    # numeric inputs non-negative for MultinomialNB — confirm.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', MultinomialNB()),
    ])
    evaluate_and_save_results(pipeline, model_num=4)

## Model 5

In [11]:
if models_to_run['model5']:
    # Model 5: token counts plus standardized numeric columns, classified by a support vector
    # machine with a polynomial kernel.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVC(kernel='poly')),
    ])
    evaluate_and_save_results(pipeline, model_num=5)

## Model 6

In [12]:
if models_to_run['model6']:
    # Model 6: token counts, standardized numeric columns, and a binary "contains an all-caps
    # word" flag, classified by a 200-tree random forest.
    text_vectorizer = CountVectorizer(
        stop_words='english',
        # ngram_range=(1, 2)
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
            # CapsBinaryFeatureExtractor is the binary yelling detector defined in this notebook;
            # the previously referenced YellingBinaryFeatureExtractor is not defined in the
            # visible cells, so enabling this cell raised a NameError.
            ('yelling', CapsBinaryFeatureExtractor(), 'Text'),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
        ]
    )

    evaluate_and_save_results(pipeline, 6)

## Model 7

In [13]:
if models_to_run['model7']:
    # Model 7: token counts, the three hand-crafted text signals (all-caps word count,
    # exclamation mark count, swearing flag), and standardized numeric columns, classified by
    # logistic regression with a raised iteration cap.
    text_vectorizer = CountVectorizer(stop_words='english')
    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
        ('exclamation', ExclamationCountFeatureExtractor(), 'Text'),
        ('swearing', SwearingFeatureExtractor(), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LogisticRegression(random_state=1, max_iter=1000)),
    ])
    evaluate_and_save_results(pipeline, model_num=7)

In [15]:
if models_to_run['model8']:
    # Model 8: token counts, standardized numeric columns, and the all-caps word count,
    # classified by a 200-tree random forest.
    text_vectorizer = CountVectorizer(
        # ngram_range=(1, 2)
        stop_words='english',
    )
    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])
    evaluate_and_save_results(pipeline, model_num=8)

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      3272
           1       0.77      0.74      0.75      1465

    accuracy                           0.85      4737
   macro avg       0.83      0.82      0.82      4737
weighted avg       0.85      0.85      0.85      4737



In [None]:
if models_to_run['model9']:
    # Model 9: token counts, standardized numeric columns, and the all-caps word count,
    # classified by a 200-tree random forest.
    # NOTE(review): this configuration is identical to the model 8 cell — confirm whether a
    # different setup was intended here.
    text_vectorizer = CountVectorizer(
        # ngram_range=(1, 2)
        stop_words='english',
    )
    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])
    evaluate_and_save_results(pipeline, model_num=9)