In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import pipeline

In [2]:
# Read the preprocessed Reddit posts from the project's data directory
df = pd.read_csv('../data/reddit_preprocessed.csv')

# Show the first rows of the label column next to the processed text
df.loc[:, ['subreddit', 'processed_text']].head()

Unnamed: 0,subreddit,processed_text
0,science,new study african reserve find dehorn rhino cu...
1,science,lowcalorie diet increase risk depression overw...
2,science,people world likely favor dominant authoritari...
3,science,selfperceived physical attractiveness link str...
4,science,efficient mrna delivery rest cell reverse hiv ...


In [None]:
# Model input (processed post text) and target (subreddit label)
x, y = df['processed_text'], df['subreddit']

## Classification Model
- Predict subreddit (science vs. technology) based on processed Reddit post text
- Model options: Logistic Regression, Naive Bayes, SVM, etc.
- Steps:
  1. Train/test split
  2. TF-IDF vectorization
  3. Model training and evaluation

In [None]:
# Hold out 20% of the posts for testing; the split is seeded and
# stratified on the subreddit label (y is df['subreddit'])
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=509,
    stratify=y,
)

# Fit the TF-IDF vocabulary on the training text only, then apply the
# fitted vectorizer to the test text
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=10)
x_train_tf = tfidf.fit_transform(X_train)
x_test_tf = tfidf.transform(X_test)

In [None]:
# Adapted from Blueprints for Text Analytics Using Python, Chapter 6

# Building training pipeline: TF-IDF features feeding a linear SVM.
# The vectorizer is a pipeline step, so the grid search tunes it together
# with the classifier and works directly on the raw text in X_train.
training_pipeline = pipeline.Pipeline(
    steps=[('tfidf', TfidfVectorizer(stop_words="english")),
           ('model', LinearSVC(random_state=509, tol=1e-5, dual='auto'))])

# Creating grid for hyperparameter tuning.
# Each dict is expanded on its own:
#   1. hinge loss with L2 penalty and a raised iteration cap
#   2. C and tol varied, with loss/penalty/max_iter left as configured
#      in the pipeline above (i.e. not overridden here)
# Both dicts explore the same TF-IDF min_df and n-gram range options.
grid_param = [{
    'tfidf__min_df': [5, 10],
    'tfidf__ngram_range': [(1, 3), (1, 6)],
    'model__penalty': ['l2'],
    'model__loss': ['hinge'],
    'model__max_iter': [10000]
}, {
    'tfidf__min_df': [5, 10],
    'tfidf__ngram_range': [(1, 3), (1, 6)],
    'model__C': [1, 10],
    'model__tol': [1e-2, 1e-3]}]

# 5-fold cross-validated search over every combination in grid_param
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
                                   param_grid=grid_param,
                                   cv=5)
gridSearchProcessor.fit(X_train, Y_train)

# Best hyperparameter combination found by the search.
# NOTE(review): the label below says "alpha", but best_params_ holds the full
# set of tuned hyperparameters (none is named alpha); the text is kept as-is
# so it still matches the recorded cell output.
best_params = gridSearchProcessor.best_params_
print("Best alpha parameter identified by grid search ", best_params)

# Mean cross-validated score of that best combination
best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search ", best_result)

Best alpha parameter identified by grid search  {'model__loss': 'hinge', 'model__max_iter': 10000, 'model__penalty': 'l2', 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 3)}
Best result identified by grid search  0.864661986972744


In [6]:
# Tabulate the cross-validation results and list the five best-ranked
# hyperparameter candidates
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
(
    gridsearch_results
    .loc[:, ['rank_test_score', 'mean_test_score', 'params']]
    .sort_values(by=['rank_test_score'])
    .head(5)
)

Unnamed: 0,rank_test_score,mean_test_score,params
0,1,0.864662,"{'model__loss': 'hinge', 'model__max_iter': 10..."
1,1,0.864662,"{'model__loss': 'hinge', 'model__max_iter': 10..."
8,3,0.861459,"{'model__C': 1, 'model__tol': 0.001, 'tfidf__m..."
5,3,0.861459,"{'model__C': 1, 'model__tol': 0.01, 'tfidf__mi..."
9,3,0.861459,"{'model__C': 1, 'model__tol': 0.001, 'tfidf__m..."


In [None]:
# Predict subreddit labels for the held-out test posts using the
# hyperparameters selected by the grid search
y_pred = gridSearchProcessor.predict(X=X_test)

In [None]:
# Confusion matrix on the held-out test set: true labels vs. predictions
confusion_matrix(y_true=Y_test, y_pred=y_pred)

array([[134,  17],
       [ 13, 151]])

In [9]:
# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_true=Y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     science       0.91      0.89      0.90       151
  technology       0.90      0.92      0.91       164

    accuracy                           0.90       315
   macro avg       0.91      0.90      0.90       315
weighted avg       0.90      0.90      0.90       315

