In [13]:
# pip install numpy scikit-learn pandas
import requests
import pandas as pd
import datetime as dt
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [20]:
# Load the scraped CSV, keep only its first 20,000 rows, and then drop every
# row that has at least one missing value in any column.
data = pd.read_csv("scraped_reddit.csv").iloc[:20000].dropna()
print(data.shape)

# The `title` column is the input text; the `flair` column is the label.
X, y = data["title"], data["flair"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

(19777, 8)


In [21]:
# Baseline model: bag-of-words counts fed into a multinomial Naive Bayes
# classifier. The token pattern keeps only runs of word characters that
# contain no digits.
nb_steps = [
    ('vect', CountVectorizer(token_pattern=r'\b[^\d\W]+\b')),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

# Features and labels are both cast to unicode arrays before fitting.
text_clf.fit(X_train.values.astype('U'), y_train.values.astype('U'))

# Accuracy on the held-out split: fraction of predictions equal to the label.
predicted = text_clf.predict(X_test.values.astype('U'))
expected = y_test.values.astype('U')
print((predicted == expected).mean())

0.5088473205257836


In [22]:
# Linear SVM: hinge loss with an L2 penalty, trained by stochastic gradient
# descent on the same bag-of-words counts as the Naive Bayes model.
svm_classifier = SGDClassifier(
    loss='hinge', penalty='l2', alpha=1e-3, random_state=42
)
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(token_pattern=r'\b[^\d\W]+\b')),
    # The step name 'clf-svm' is the prefix used to address this estimator's
    # parameters (e.g. "clf-svm__alpha").
    ('clf-svm', svm_classifier),
])

# Features and labels are both cast to unicode arrays before fitting.
text_clf_svm.fit(X_train.values.astype('U'), y_train.values.astype('U'))

# Accuracy on the held-out split: fraction of predictions equal to the label.
predicted = text_clf_svm.predict(X_test.values.astype('U'))
expected = y_test.values.astype('U')
print((predicted == expected).mean())

0.5245197168857432


In [23]:
# Tune the SGD-trained linear SVM pipeline with an exhaustive, cross-validated
# grid search over its regularisation settings.

# NOTE: this rebinds X and y (previously the full `title` / `flair` columns)
# to the training split only, cast to unicode arrays.
X = X_train.values.astype('U')
y = y_train.values.astype('U')

# Every combination of the values below is tried. The "clf-svm__" prefix
# routes each parameter to the pipeline step named 'clf-svm'.
param_grid = {"clf-svm__max_iter": [1200000],
              "clf-svm__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              # "No penalty" is spelled None, not the string "none": recent
              # scikit-learn releases reject "none", and with GridSearchCV's
              # default error_score those candidates would be scored NaN
              # instead of being evaluated.
              "clf-svm__penalty": [None, "l1", "l2"]}

# Run the grid search with GridSearchCV's default cross-validation and scoring.
grid_search = GridSearchCV(text_clf_svm, param_grid=param_grid)
grid_search.fit(X, y)

# Mean cross-validated score of the best parameter combination found.
print(grid_search.best_score_)
 



0.5189935750335029
