## Imports

In [1]:
from __future__ import annotations
from typing import Optional, Any, Tuple, List
import os
import re

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

## Reading Datafiles

In [2]:
def read_csv(filename: str) -> pd.DataFrame:
    """
    Loads a comma-separated file into a DataFrame, decoding its contents as latin-1.
    """
    csv_options = {'sep': ',', 'encoding': 'latin-1'}
    return pd.read_csv(filename, **csv_options)

# Load the training/validation rows and the test rows used for the submission files.
train_and_val_data, test_data = (
    read_csv(path)
    for path in (
        'data/ds4420_kaggle_train_data.csv',
        'data/ds4420_kaggle_test_data.csv',
    )
)

def split_label(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separates the 'Label' column from the remaining columns.

    Returns a (features, labels) pair: a copy of `data` without its 'Label' column, and the
    'Label' column itself as a Series. `data` is left unmodified. Raises KeyError when `data`
    has no 'Label' column.
    """
    return data.drop(columns='Label'), data['Label']

# The test data is used as-is as the feature frame for the final predictions.
X_test = test_data

X_train_and_val, y_train_and_val = split_label(train_and_val_data)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val,
    y_train_and_val,
    test_size=0.2,
    random_state=1,
)

## Custom Feature Extractors

In [3]:
class ThresholdClassifier(BaseEstimator, TransformerMixin):
    """
    Wraps a probabilistic classifier and flags a row as positive only when the probability in
    column 1 of the wrapped model's `predict_proba` output exceeds a configurable threshold.

    Parameters:
        model: estimator exposing `fit` and `predict_proba`; it is fitted in place by `fit`.
        threshold: cut-off the column-1 probability must strictly exceed (default 0.5).
    """

    model: BaseEstimator
    threshold: float

    def __init__(self, model: BaseEstimator, threshold: float=0.5) -> None:
        super().__init__()
        self.model = model
        self.threshold = threshold

    def fit(self, X: Any, y: Optional[Any]=None) -> ThresholdClassifier:
        """
        Fits the wrapped model on X and y and returns this wrapper.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X: Any) -> Any:
        """
        Returns one boolean per row of X: True where the column-1 probability from the wrapped
        model is strictly greater than the threshold. Note that the result holds booleans, not
        the wrapped model's class labels.
        """
        positive_probability = self.model.predict_proba(X)[:, 1]
        return positive_probability > self.threshold

class TextCleaner(BaseEstimator, TransformerMixin):
    """
    Normalizes a text feature by lower-casing it and removing every character that is neither a
    word character nor whitespace.
    """

    def __init__(self) -> None:
        super().__init__()

    def __clean_text(self, text: str) -> str:
        lowered = text.lower()
        return re.sub(r'[^\w\s]', '', lowered)

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> TextCleaner:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        # Clean each text individually; the result is returned as produced by `apply`.
        return X.apply(self.__clean_text)

class InteractFeaturesTransformer(FunctionTransformer):
    """
    Multiplies two features together to yield a new feature.

    The transformer is configured with `validate=True`, and the interaction function slices
    the input as a 2-D array: the output is a single column holding the element-wise product
    of the first two input columns. Any further columns are ignored.
    """

    def __init__(self) -> None:
        FunctionTransformer.__init__(self, self.__interact_features, validate=True)

    def __interact_features(self, X: Any) -> Any:
        # With fewer than two columns the second slice is empty and the product would silently
        # broadcast to an array with zero columns, so fail loudly instead.
        if X.shape[1] < 2:
            raise ValueError('InteractFeaturesTransformer requires at least two feature columns.')
        # Slices (rather than integer indices) keep both operands two-dimensional.
        return X[:, 0:1] * X[:, 1:2]

class LengthFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turns a text feature into a single numeric column holding the length of each text.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> LengthFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        lengths = X.apply(len)
        # Reshape into one column with one row per text.
        return lengths.values.reshape(-1, 1)

class CapsBinaryFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turns a text feature into a single binary column: 1 when at least one word of the text is
    written entirely in upper case (treated as yelling), 0 otherwise. Words are obtained by
    splitting the text on single spaces.
    """

    def __init__(self) -> None:
        super().__init__()

    def __text_contains_caps_binary(self, text: str) -> int:
        # Stop at the first fully upper-case word.
        for word in text.split(' '):
            if word.isupper():
                return 1
        return 0

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> CapsBinaryFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        flags = X.apply(self.__text_contains_caps_binary)
        # Reshape into one column with one row per text.
        return flags.values.reshape(-1, 1)

class CapsCountFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turns a text feature into a single numeric column counting the words of each text that are
    written entirely in upper case (treated as yelling). Words are obtained by splitting the
    text on single spaces.
    """

    def __init__(self) -> None:
        super().__init__()

    def __text_caps_count(self, text: str) -> int:
        shouted_words = [word for word in text.split(' ') if word.isupper()]
        return len(shouted_words)

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> CapsCountFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        counts = X.apply(self.__text_caps_count)
        # Reshape into one column with one row per text.
        return counts.values.reshape(-1, 1)

class CharBinaryFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a binary feature where 1 indicates the text contains at least
    one occurrence of the given character.

    Parameters:
        char: the single character to look for.

    Raises:
        ValueError: if `char` is not exactly one character long.
    """

    char: str

    def __init__(self, char: str) -> None:
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)

        if len(char) != 1:
            raise ValueError('Character must be a string of length 1.')

        self.char = char

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> CharBinaryFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> Any:
        """
        Returns a single-column array with one 0/1 entry per text in X.
        """
        feature = X.apply(lambda text: int(self.char in text))
        return feature.values.reshape(-1, 1)

class CharCountFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Converts a text feature into a numeric feature counting how many times the given character
    occurs in each text (e.g. '!' to count exclamation marks).

    Parameters:
        char: the single character to count.

    Raises:
        ValueError: if `char` is not exactly one character long.
    """

    char: str

    def __init__(self, char: str) -> None:
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)

        if len(char) != 1:
            raise ValueError('Character must be a string of length 1.')

        self.char = char

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> CharCountFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> Any:
        """
        Returns a single-column array with one occurrence count per text in X.
        """
        feature = X.apply(lambda text: text.count(self.char))
        return feature.values.reshape(-1, 1)

class SwearingFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turns a text feature into a single binary column: 1 when the text contains two consecutive
    asterisks (the usual way censored swearing such as **** shows up), 0 otherwise.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> SwearingFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        def has_censored_word(text):
            return int('**' in text)

        flags = X.apply(has_censored_word)
        # Reshape into one column with one row per text.
        return flags.values.reshape(-1, 1)

class RepeatedLetterFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turns a text feature into a single binary column: 1 when the text contains the same ASCII
    letter three or more times in a row (e.g. "sooo"), 0 otherwise.
    """

    def __init__(self) -> None:
        super().__init__()

    def __repeated_letter_binary(self, text: str) -> int:
        # One letter followed by at least two further copies of itself.
        match = re.search(r'([a-zA-Z])\1{2,}', text)
        return int(match is not None)

    def fit(self, X: pd.Series, y: Optional[Any]=None) -> RepeatedLetterFeatureExtractor:
        # Nothing is learned from the data.
        return self

    def transform(self, X: pd.Series) -> pd.DataFrame:
        flags = X.apply(self.__repeated_letter_binary)
        # Reshape into one column with one row per text.
        return flags.values.reshape(-1, 1)

## Reusable Functionality to Evaluate a Model and Save Results

In [4]:
def evaluate_and_save_results(pipeline: Pipeline, model_num: int) -> None:
    """
    Fits the pipeline on the training split, prints validation metrics, and writes the test
    predictions to outputs/model{model_num}.csv.

    Reads the notebook-level variables X_train, y_train, X_val, y_val, X_test and test_data.

    Parameters:
        pipeline: the unfitted pipeline to train; it is fitted in place.
        model_num: number used in the name of the output file.

    Prints the validation confusion matrix, precision, recall and F1 score. The output file
    holds the 'ID' column of the test data and the predicted 'Label'.
    """
    X_val_prepared = X_val.copy()
    X_test_prepared = X_test.copy()
    # NOTE(review): the pipeline is fitted on X_train, which does not get this extra column;
    # confirm whether 'Selected_Text' is still needed.
    X_val_prepared['Selected_Text'] = X_val['Text']
    X_test_prepared['Selected_Text'] = X_test['Text']

    pipeline.fit(X_train, y_train)

    y_val_pred = pipeline.predict(X_val_prepared)
    y_test_pred = pipeline.predict(X_test_prepared)

    print(confusion_matrix(y_val, y_val_pred))
    print(f'Precision: {precision_score(y_val, y_val_pred):.04f}')
    print(f'Recall: {recall_score(y_val, y_val_pred):.04f}')
    print(f'F1 Score: {f1_score(y_val, y_val_pred):.04f}')

    # NOTE(review): predictions are written exactly as returned by the pipeline; a final step
    # that returns booleans therefore produces True/False labels — confirm the expected format.
    test_data_exportable = test_data.copy()
    test_data_exportable['Label'] = y_test_pred
    test_data_exportable = test_data_exportable[['ID', 'Label']]

    output_path = f'outputs/model{model_num}.csv'
    # Create the output directory first so the export also works on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    test_data_exportable.to_csv(output_path, index=False)

## A "Switchboard" to Enable and Disable Running Each Model

In [5]:
# Maps 'model1' .. 'model18' to whether that model's cell should run.
# Add a model's number to the set below to enable it.
models_to_run = {
    f'model{number}': number in {18}
    for number in range(1, 19)
}

## Model 1

In [6]:
if models_to_run['model1']:
    # Baseline: TF-IDF features of the 'Text' column fed to logistic regression.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LogisticRegression(random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=1)

## Model 2

In [7]:
if models_to_run['model2']:
    # TF-IDF text features (English stop words removed) plus standardized numeric columns,
    # classified by a 200-tree random forest.
    preprocessor = ColumnTransformer([
        ('text', TfidfVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=2)

## Model 3

In [8]:
if models_to_run['model3']:
    # Same as model 2 but with raw word counts instead of TF-IDF weights.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=3)

## Model 4

In [9]:
if models_to_run['model4']:
    # Word counts plus min-max scaled numeric columns, classified by multinomial naive Bayes.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', MultinomialNB()),
    ])

    evaluate_and_save_results(pipeline, model_num=4)

## Model 5

In [10]:
if models_to_run['model5']:
    # Word counts plus standardized numeric columns, classified by a polynomial-kernel SVM.
    preprocessor = ColumnTransformer([
        ('text', CountVectorizer(stop_words='english'), 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVC(kernel='poly')),
    ])

    evaluate_and_save_results(pipeline, model_num=5)

## Model 6

In [11]:
if models_to_run['model6']:
    # Word counts and standardized numeric columns plus a binary "yelling" flag (at least one
    # fully capitalized word), classified by a 200-tree random forest.
    text_vectorizer = CountVectorizer(
        stop_words='english',
        # ngram_range=(1, 2)
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
            # The yelling flag is provided by CapsBinaryFeatureExtractor; the previously
            # referenced YellingBinaryFeatureExtractor is not defined in this notebook.
            ('yelling', CapsBinaryFeatureExtractor(), 'Text'),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
        ]
    )

    evaluate_and_save_results(pipeline, 6)

## Model 7

In [12]:
if models_to_run['model7']:
    # Word counts, hand-crafted tone features (capitalized-word count, exclamation-mark count,
    # swearing flag) and standardized numeric columns, classified by logistic regression.
    text_vectorizer = CountVectorizer(
        stop_words='english',
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('capitalization', CapsCountFeatureExtractor(), 'Text'),
            # Exclamation marks are counted with the generic CharCountFeatureExtractor; the
            # previously referenced ExclamationCountFeatureExtractor is not defined in this
            # notebook.
            ('exclamation', CharCountFeatureExtractor('!'), 'Text'),
            ('swearing', SwearingFeatureExtractor(), 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', LogisticRegression(random_state=1, max_iter=1000)),
        ]
    )

    evaluate_and_save_results(pipeline, 7)

In [13]:
if models_to_run['model8']:
    # Word counts, standardized numeric columns and the capitalized-word count, classified by
    # a 200-tree random forest.
    text_vectorizer = CountVectorizer(
        stop_words='english',
        # ngram_range=(1, 2)
    )

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=200, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=8)

In [14]:
if models_to_run['model9']:
    # Same features as model 8, with the forest grown to 225 trees.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestClassifier(n_estimators=225, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=9)

In [15]:
if models_to_run['model10']:
    # Hyper-parameter search for the random forest of model 9 (3-fold CV, scored by F1).
    text_vectorizer = CountVectorizer(
        stop_words='english',
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_vectorizer, 'Text'),
            ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
            ('capitalization', CapsCountFeatureExtractor(), 'Text'),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestClassifier(n_estimators=225, random_state=1)),
        ]
    )

    # Parameters are addressed through the pipeline's 'regressor' step.
    param_grid = {
        'regressor__n_estimators': [220, 225, 230],
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': [2, 10],
        'regressor__min_samples_leaf': [1, 2],
        'regressor__max_features': [None, 'sqrt', 'log2']
    }

    # Search over the whole pipeline so the vectorizer and scaler are re-fitted on the training
    # part of every fold; fitting the preprocessor on all rows beforehand would leak the
    # held-out fold into the vocabulary and scaling statistics.
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1')
    grid_search.fit(X_train_and_val, y_train_and_val)

    # Strip the step prefix so the forest's parameters are printed under their own names.
    print({name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()})

    # evaluate_and_save_results(pipeline, 10)

In [16]:
if models_to_run['model11']:
    # Voting ensemble of logistic regression and a 225-tree random forest on the model 9
    # features.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    voting_classifier = VotingClassifier([
        ('logistic_regression', LogisticRegression(random_state=1)),
        ('random_forest', RandomForestClassifier(n_estimators=225, random_state=1)),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('voting', voting_classifier),
    ])

    evaluate_and_save_results(pipeline, model_num=11)

In [17]:
if models_to_run['model12']:
    # Model 9 features with the 225-tree forest wrapped so that a row is flagged only above a
    # 0.55 probability threshold.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', ThresholdClassifier(RandomForestClassifier(n_estimators=225, random_state=1), 0.55)),
    ])

    evaluate_and_save_results(pipeline, model_num=12)

In [18]:
if models_to_run['model13']:
    # Model 9 features with a 221-tree random forest.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=221, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=13)

In [19]:
if models_to_run['model14']:
    # Character n-grams (3 to 7 characters) of the cleaned text, standardized numeric columns
    # and the capitalized-word count, classified by logistic regression at a 0.45 threshold.
    text_chunk_vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 7))

    # Only referenced by the disabled 'words' transformer below.
    text_word_vectorizer = CountVectorizer(stop_words='english')

    text_chunk_vectorization_pipeline = Pipeline([
        ('cleaning', TextCleaner()),
        ('vectorizing', text_chunk_vectorizer),
    ])

    preprocessor = ColumnTransformer([
        ('chunks', text_chunk_vectorization_pipeline, 'Text'),
        #('words', text_word_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
        #('swearing', SwearingFeatureExtractor(), 'Text'),
        #('repeated_letters', RepeatedLetterFeatureExtractor(), 'Text')
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        #('classification', RandomForestClassifier(n_estimators=221, random_state=1)),
        ('classification', ThresholdClassifier(LogisticRegression(max_iter=1_000, random_state=1), 0.45)),
    ])

    evaluate_and_save_results(pipeline, model_num=14)

In [20]:
if models_to_run['model15']:
    # Same configuration as model 13 (word counts, standardized numeric columns,
    # capitalized-word count, 221-tree random forest), saved under its own number.
    text_vectorizer = CountVectorizer(stop_words='english')

    preprocessor = ColumnTransformer([
        ('text', text_vectorizer, 'Text'),
        ('numeric', StandardScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('capitalization', CapsCountFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=221, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=15)

In [21]:
if models_to_run['model16']:
    # Min-max scaled numeric columns plus word counts (stop words kept), classified by
    # logistic regression.
    text_vectorizer = CountVectorizer(
        #stop_words='english',
    )

    preprocessor = ColumnTransformer([
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('text', text_vectorizer, 'Text'),
        # ('capitalization', Pipeline([
        #     ('extractor', CapsCountFeatureExtractor()),
        #     ('scaler', MinMaxScaler()),
        # ]), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=1)),
    ])

    evaluate_and_save_results(pipeline, model_num=16)

In [36]:
if models_to_run['model17']:
    # Model 16 features with logistic regression thresholded at 0.41 instead of the default.
    text_vectorizer = CountVectorizer(
        #stop_words='english',
        #ngram_range=(1, 2),
        #max_features=10_000,
        #min_df=11,
    )

    preprocessor = ColumnTransformer([
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('text', text_vectorizer, 'Text'),
        # ('capitalization', Pipeline([
        #     ('extractor', CapsCountFeatureExtractor()),
        #     ('scaler', MinMaxScaler()),
        # ]), 'Text'),
        #('exclamation', CharCountFeatureExtractor('!'), 'Text'),
        #('question', CharCountFeatureExtractor('?'), 'Text'),
        #('swearing', SwearingFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', ThresholdClassifier(LogisticRegression(max_iter=1000, random_state=1), 0.41)),
    ])

    evaluate_and_save_results(pipeline, model_num=17)

## Model 18 (My Best Submission)

In [38]:
if models_to_run['model18']:
    # Min-max scaled numeric columns plus word counts (stop words kept), classified by
    # logistic regression with a 0.41 probability threshold.
    text_vectorizer = CountVectorizer(
        #stop_words='english',
        #ngram_range=(1, 2),
        #max_features=10_000,
        #min_df=11,
    )

    preprocessor = ColumnTransformer([
        ('numeric', MinMaxScaler(), ['User_Age', 'Time_of_Post', 'Population_Density']),
        ('text', text_vectorizer, 'Text'),
        # ('capitalization', Pipeline([
        #     ('extractor', CapsCountFeatureExtractor()),
        #     ('scaler', MinMaxScaler()),
        # ]), 'Text'),
        #('exclamation', CharCountFeatureExtractor('!'), 'Text'),
        #('question', CharCountFeatureExtractor('?'), 'Text'),
        #('swearing', SwearingFeatureExtractor(), 'Text'),
    ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', ThresholdClassifier(LogisticRegression(max_iter=1000, random_state=1), 0.41)),
    ])

    evaluate_and_save_results(pipeline, model_num=18)

[[2951  321]
 [ 385 1080]]
Precision: 0.7709
Recall: 0.7372
F1 Score: 0.7537
