# Creation of "simpler" models using Scikit-learn

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectPercentile

def load_data(data_dir='data', encoder_path='pickles/label_encoder.pkl'):
    """
    Load the train/validation/test splits and the pickled label encoder.

    Each split is read from ``<data_dir>/cs_subs_<split>.csv``. The ``title``
    column is returned as the features and the ``subreddit`` column as the
    target.

    Parameters
    ----------
    data_dir : str, default 'data'
        Directory containing ``cs_subs_train.csv``, ``cs_subs_val.csv`` and
        ``cs_subs_test.csv``.
    encoder_path : str, default 'pickles/label_encoder.pkl'
        Path of the pickle file holding the label encoder object.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, label_encoder)``
        where the first six items are pandas Series and the last one is
        whatever object was unpickled from ``encoder_path``.
    """
    columns = []
    for split in ('train', 'val', 'test'):
        frame = pd.read_csv('{}/cs_subs_{}.csv'.format(data_dir, split))
        columns.append(frame['title'])
        columns.append(frame['subreddit'])

    # Use a context manager so the file handle is always closed; a bare
    # open() passed straight to pickle.load leaks the descriptor.
    # NOTE: unpickling executes arbitrary code - only load trusted files.
    with open(encoder_path, 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    return tuple(columns) + (label_encoder,)

# Unpack the three (features, target) splits plus the label encoder into
# notebook-level names used by the cells below.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    label_encoder,
) = load_data()

#### Creating tf-idf vectors

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Unigram tf-idf with English stop words removed. The vocabulary and idf
# weights are fitted on the training titles only; the validation and test
# titles are merely transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors, X_test_vectors = (
    vectorizer.transform(titles) for titles in (X_val, X_test)
)

In [94]:
def top_n_accuracy(y_true, probs, n=5, classes=None):
    """
    Return the top-N accuracy of a classifier, i.e. the fraction of samples
    whose true label is among the N labels the classifier scores highest.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True label of every sample.
    probs : array-like of shape (n_samples, n_classes)
        Per-class scores for every sample (e.g. the result of
        ``predict_proba``).
    n : int, default 5
        How many of the highest-scoring classes count as a hit. Values larger
        than the number of classes are clamped to the number of classes.
    classes : array-like of shape (n_classes,), optional
        Label belonging to each column of ``probs`` (e.g. ``clf.classes_``).
        When omitted, the column index itself is used as the label, which is
        only correct if the labels are exactly ``0 .. n_classes - 1``.

    Returns
    -------
    float
        Fraction of samples in ``[0, 1]``; ``nan`` when ``y_true`` is empty.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, ``probs`` is not two-dimensional, or
        ``y_true`` and ``probs`` disagree on the number of samples.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {}'.format(n))

    y_true = np.asarray(y_true)
    if y_true.size == 0:
        # Accuracy over zero samples is undefined.
        return float('nan')

    probs = np.asarray(probs)
    if probs.ndim != 2:
        raise ValueError('probs must be 2-dimensional (n_samples, n_classes)')
    if probs.shape[0] != y_true.shape[0]:
        raise ValueError(
            'y_true has {} samples but probs has {}'.format(
                y_true.shape[0], probs.shape[0]))

    # argpartition rejects kth values beyond the axis length, so clamp n.
    k = min(n, probs.shape[1])
    # Column indices of the k largest scores of every row (unordered).
    top_idx = np.argpartition(probs, -k, axis=1)[:, -k:]
    top_labels = top_idx if classes is None else np.asarray(classes)[top_idx]

    hits = (top_labels == y_true[:, None]).any(axis=1)
    return hits.mean()

### Naive Bayes

In [91]:
from sklearn.naive_bayes import MultinomialNB
# Keep the 30% highest-scoring tf-idf features (selector's default scoring
# function). The selector is fitted on the training split only and then
# re-used to reduce the validation split.
bf = SelectPercentile(percentile=30)
X_train_nb = bf.fit_transform(X_train_vectors, y_train)
X_val_nb = bf.transform(X_val_vectors)

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# reduced feature matrix. Left as the last expression so the fitted
# estimator is echoed as the cell output.
nb = MultinomialNB()
nb.fit(X_train_nb, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
# Per-class probabilities and hard label predictions on the validation split.
nb_probs = nb.predict_proba(X_val_nb)
nb_predictions = nb.predict(X_val_nb)

# Overall accuracy followed by the per-class precision/recall/F1 breakdown.
print(
    'Top 1 accuracy:\n',
    accuracy_score(y_val, nb_predictions),
)
print(
    classification_report(y_val, nb_predictions)
)

Top 1 accuracy:
 0.30009158441
             precision    recall  f1-score   support

          0       0.18      0.98      0.31      1352
          1       0.70      0.39      0.50       193
          3       0.00      0.00      0.00       115
          4       0.83      0.02      0.05       216
          5       1.00      0.04      0.08       126
          6       0.00      0.00      0.00       102
          7       1.00      0.15      0.26        92
          9       0.00      0.00      0.00        66
         10       0.00      0.00      0.00        13
         12       1.00      0.02      0.04        90
         13       0.00      0.00      0.00        14
         14       0.00      0.00      0.00        45
         15       0.00      0.00      0.00        86
         16       0.00      0.00      0.00        12
         17       0.00      0.00      0.00        20
         18       0.00      0.00      0.00        16
         19       0.00      0.00      0.00        40
         20   

  'precision', 'predicted', average, warn_for)


### Support Vector Machine
#### Linear SVM (one-vs-rest) trained on the top 30% of tf-idf features

In [99]:
from sklearn.svm import LinearSVC
# Linear SVM, one-vs-rest, L2 penalty with squared hinge loss.
# LinearSVC's solver draws random numbers while fitting, so the seed is fixed
# to make the reported scores reproducible across kernel restarts.
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr',
                max_iter=1000, random_state=42)

# Keep the 30% highest-scoring tf-idf features; the selector is fitted on the
# training split only and re-used to reduce the validation split.
bf = SelectPercentile(percentile=30)
X_train_svm = bf.fit_transform(X_train_vectors, y_train)
X_val_svm = bf.transform(X_val_vectors)

svm.fit(X_train_svm, y_train)

# Overall accuracy followed by the per-class precision/recall/F1 breakdown.
svm_predictions = svm.predict(X_val_svm)
print('Top 1 accuracy:\n', accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

Top 1 accuracy:
 0.500712323191
             precision    recall  f1-score   support

          0       0.70      0.86      0.77      1352
          1       0.63      0.70      0.66       193
          3       0.15      0.07      0.09       115
          4       0.40      0.31      0.35       216
          5       0.48      0.50      0.49       126
          6       0.15      0.07      0.09       102
          7       0.89      0.71      0.79        92
          9       0.17      0.14      0.15        66
         10       0.29      0.15      0.20        13
         12       0.65      0.59      0.62        90
         13       0.82      1.00      0.90        14
         14       0.50      0.33      0.40        45
         15       0.38      0.42      0.40        86
         16       0.33      0.08      0.13        12
         17       0.81      0.85      0.83        20
         18       0.50      0.19      0.27        16
         19       0.72      0.70      0.71        40
         20  

  'precision', 'predicted', average, warn_for)
