In [1]:
# import necessary modules
import numpy as np
import pandas as pd

%matplotlib inline

In [47]:
def getTestPredictions(vectorizer, classifier):
    """Predict labels for the test set held in the module-level ``df_test``.

    Parameters
    ----------
    vectorizer : fitted transformer exposing ``transform``
        A multi-column transformer (anything with a ``transformers``
        attribute, such as a ``ColumnTransformer``) receives the whole test
        frame, because it picks its input columns by name. Any other
        vectorizer receives only the ``text_clean`` column.
    classifier : fitted estimator exposing ``predict``
        Expected to have been trained on features produced by ``vectorizer``.

    Returns
    -------
    Whatever ``classifier.predict`` returns for the transformed test data.
    """
    # A transformer that selects columns by name cannot consume a bare Series,
    # so it gets the full DataFrame; single-column text vectorizers keep
    # receiving just the cleaned text.
    if hasattr(vectorizer, "transformers"):
        test_input = df_test
    else:
        test_input = df_test.text_clean
    X = vectorizer.transform(test_input)
    return classifier.predict(X)

# Load the training frame, then the test frame that getTestPredictions reads
# through the module-level name `df_test`.
# NOTE(review): the test file is named "cleaning_..." while the training file
# is "cleaned_..." -- confirm the test filename is spelled as intended.
df_train = pd.read_csv("cleaned_training_data.csv")
df_test = pd.read_csv("cleaning_test_data.csv")

In [None]:
# Importing all models and sklearn utilities

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

### Feature space generation

#### ColumnTransformer with TF-IDF

In [51]:
# Build the TF-IDF feature space.
# Each text-like column of df_train gets its own TF-IDF vocabulary; only the
# cleaned tweet text is expanded to unigrams + bigrams.
tfidf_steps = [
    ('tfidf_text', TfidfVectorizer(lowercase=True, ngram_range=(1, 2)), 'text_clean'),
    ('tfidf_bigram', TfidfVectorizer(lowercase=True), 'bigrams'),
    ('tfidf_htags', TfidfVectorizer(lowercase=True), 'hashtags'),
    ('tfidf_mentions', TfidfVectorizer(lowercase=True), 'mentions'),
]

# Columns removed from the feature space (`party_id` is the training label).
drop_steps = [
    ('drop_year', 'drop', 'year'),
    ('drop_full_text', 'drop', 'full_text'),
    ('drop_party_id', 'drop', 'party_id'),
]

column_trans = ColumnTransformer(
    tfidf_steps + drop_steps,
    remainder='passthrough',  # every column not listed above is kept unchanged
    n_jobs=-1,                # fit the per-column transformers in parallel
)

# Learn the vocabularies and produce the training feature matrix in one pass.
train = column_trans.fit_transform(df_train)

#### Vanilla CountVectorization

In [None]:
# Bag-of-words alternative to the TF-IDF feature space: unigram and bigram
# counts over the cleaned text only.
transformer = CountVectorizer(lowercase=True, ngram_range=(1, 2))

# Fit the count vectorizer itself so it can later transform the test set.
# The matrix gets its own name so that `train` keeps holding the features
# produced by `column_trans`, the transformer it was built with.
train_counts = transformer.fit_transform(df_train.text_clean)

### Training with models

For brevity, only the two most successful vectorization techniques (above) and the two most successful classification models (below) are shown. As detailed in the report, `MultinomialNB`, `LinearSVC`, and `GradientBoostingClassifier` were also tried.

In [None]:
# Supervised target: the `party_id` column of the training frame.
train_labels = df_train["party_id"]

#### LogisticRegression

For logistic regression, a grid search selected `C`=10 and `solver`=`liblinear`. In the implementation below, `liblinear` has been swapped for the stochastic `saga` solver, which reduces the runtime considerably.

In [None]:
# Hold out 30% of the training rows to estimate accuracy. Both the split and
# the stochastic `saga` solver are seeded so the reported score is
# reproducible after a kernel restart.
xtrain, xtest, ytrain, ytest = train_test_split(
    train, train_labels, test_size=0.3, random_state=42
)
classifier = LogisticRegression(
    C=10, max_iter=10000, n_jobs=-1, solver='saga', random_state=42
)
classifier.fit(xtrain, ytrain)
score = classifier.score(xtest, ytest)  # mean accuracy on the held-out rows
print(f'{classifier}: {score}')

# Predict the test set with the same transformer that produced `train`.
y_pred = getTestPredictions(column_trans, classifier)

# Build the submission column by column: stacking ids and predictions into a
# single ndarray would force both onto one common dtype.
submission_df = pd.DataFrame({'id': np.arange(y_pred.size), 'party': y_pred})
submission_df.to_csv('submission_lrc.csv', index=False)

#### RidgeClassifierCV

The `cv`=8 parameter makes the classifier choose its regularization strength by K-fold cross-validation with 8 folds during training.

In [None]:
# Train on CountVectorizer features, fitted here, so the classifier and the
# `transformer` used for the test predictions share one feature space.
train_counts = transformer.fit_transform(df_train.text_clean)
xtrain, xtest, ytrain, ytest = train_test_split(
    train_counts, train_labels, test_size=0.3, random_state=42
)

# RidgeClassifierCV takes no C / max_iter / n_jobs / solver arguments; it
# picks its regularization strength from `alphas` by cross-validation,
# here with 8 folds.
classifier = RidgeClassifierCV(cv=8)
classifier.fit(xtrain, ytrain)
score = classifier.score(xtest, ytest)  # mean accuracy on the held-out rows
print(f'{classifier}: {score}')

y_pred = getTestPredictions(transformer, classifier)

# Build the submission column by column: stacking ids and predictions into a
# single ndarray would force both onto one common dtype.
submission_df = pd.DataFrame({'id': np.arange(y_pred.size), 'party': y_pred})
submission_df.to_csv('submission_rcv.csv', index=False)