In [None]:
import pandas as pd
import numpy as np
from google.colab import files
import io
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron, LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,accuracy_score
from google.colab import files
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.


## **1.Data Preprocessing**

In [None]:
"""
1. Selectthe files train and test files to upload. (Modify the file names accordingly; train.csv and test.csv take here)
2. Importing stopwords from text.ENGLISH_STOP_WORDS and modifying the set to store in my_stop_words
3. Lemmatization method from class tutorial to take Adjectives, nouns, verbs and adverbs

"""
data_to_load = files.upload()
df_train = pd.read_csv(io.BytesIO(data_to_load['train.csv']))
df_test = pd.read_csv(io.BytesIO(data_to_load['test.csv']))
X_train = df_train['body']
y_train = df_train['subreddit']
X_test = df_test['body']

########################################################################################

stop_words_set = text.ENGLISH_STOP_WORDS
my_stop_words = ['far','make','u']
for word in stop_words_set:
  my_stop_words.append(word)
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
class New_LemmaTokenizer:
     def __init__(self):
       self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
       return [self.wnl.lemmatize(t,pos =get_wordnet_pos(t)) for t in word_tokenize(doc) if t.isalpha()]

Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


# **2. Models**

# **A. Comparson of all models**


We quickly run suitable classifiers to find their accuracies and other metrics

In [None]:
"""
Using train_test_split to split traing data into two parts: 80% train and 20% test.
This method is preferred to Cross validation to save time.

"""
xtrain,xtest,ytrain,ytest = train_test_split(X_train,y_train, test_size=0.2,random_state=0)

classifiers = [
    ('KNN', KNeighborsClassifier()),
    ('KNN bagging', BaggingClassifier(base_estimator = KNeighborsClassifier(), max_samples=0.5, max_features=0.5)),
    ('decision tree', DecisionTreeClassifier()),
    ('random forests', RandomForestClassifier()),
    ('Adaboost', AdaBoostClassifier(n_estimators=50)),
    ('MNB', MultinomialNB(alpha=0.01)),
    ('Bernoulli NB', BernoulliNB(alpha=0.01)),
    ('Linear SVC', LinearSVC(tol=1e-5)),
    ('SVC with bagging', BaggingClassifier(base_estimator = LinearSVC(), max_samples=0.5, max_features=0.5))
    ]
#############################################################################
for model_name, model in classifiers:
  classifier = model
  print(model_name)
  pipeline_clf = Pipeline([
                           ('vect', CountVectorizer(tokenizer=New_LemmaTokenizer(), 
                                                    stop_words = my_stop_words)),
                           ('tfidf', TfidfTransformer(norm = 'l1')),
                           ('norm', Normalizer()),
                           ('clf', classifier)
                           
  ])
  pipeline_clf.fit(xtrain,ytrain)
  prediction_clf = pipeline_clf.predict(xtest)
  print(classification_report(ytest,prediction_clf))
  print('\n',accuracy_score(ytest,prediction_clf))
  print()
  print('.'*80,'\n')

# **B. Best hyperparamters: MNB**

In [None]:
"""
To find the best hyper paramters
"""

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__strip_accents' : ('unicode', False),
    'vect__max_df' : (0.5, 0.75,1.0),
    'vect__min_df':(1,2),
    'vect__binary':(True, False),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.005, 0.01,0.1, 1),
}
#####################################################################

print("Performing grid search")
print('.'*100)
grid_search = GridSearchCV(pipeline, parameters, cv =5)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
  print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search
....................................................................................................
Best score: 0.889
Best parameters set:
	clf__alpha: 0.01
	tfidf__norm: 'l1'
	vect__binary: False
	vect__max_df: 0.75
	vect__min_df: 1
	vect__ngram_range: (1, 1)
	vect__strip_accents: False


In [None]:
"""
Checking without TFID transformer
"""

pipeline_MNB_2 = Pipeline([
    ('vect', CountVectorizer(tokenizer=New_LemmaTokenizer(), 
                             stop_words=my_stop_words,)),
                             
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha = 0.01)),
])

parameters_MNB_2 = {
    'vect__max_df' : (0.5,0.75, 1)
}
#####################################################################
print("Performing grid search")
print('.'*100)
grid_search_MNB_2 = GridSearchCV(pipeline_MNB_2, parameters_MNB_2, cv=5)
grid_search_MNB_2.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_MNB_2.best_score_)
print("Best parameters set:")
best_parameters_MNB_2 = grid_search_MNB_2.best_estimator_.get_params()
for param_name in sorted(parameters_MNB_2.keys()):
  print("\t%s: %r" % (param_name, best_parameters_MNB_2[param_name]))

Performing grid search
....................................................................................................


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


Best score: 0.884
Best parameters set:
	vect__max_df: 0.5


# **C. Best Hyperparamters: LinearSVC**

In [None]:
pipeline_SVC = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC()),
])

parameters_SVC = {
    'vect__strip_accents' : ('unicode', False),
    'vect__max_df' : (0.5,0.75,0.95,1.0),
    'vect__min_df':(1,2),
    'tfidf__norm': ('l1', 'l2'),
    'clf__tol' : (1e-5,1e-6),
    'clf__C': (0.1, 0.5, 1,2,5),
    'clf__random_state' :(False, 0)
    
}
#####################################################################
print("Performing grid search")
print('.'*100)
grid_search_SVC = GridSearchCV(pipeline_SVC, parameters_SVC, cv = 5)
grid_search_SVC.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_SVC.best_score_)
print("Best parameters set:")
best_parameters_SVC = grid_search_SVC.best_estimator_.get_params()
for param_name in sorted(parameters_SVC.keys()):
  print("\t%s: %r" % (param_name, best_parameters_SVC[param_name]))

Performing grid search
....................................................................................................
Best score: 0.903
Best parameters set:
	clf__C: 1
	clf__random_state: False
	clf__tol: 1e-05
	tfidf__norm: 'l1'
	vect__max_df: 0.95
	vect__min_df: 1
	vect__strip_accents: 'unicode'


# **D. Best Hyperparamters: Random Forest**

In [None]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', RandomForestClassifier()),
])

parameters_rf = {
    'vect__strip_accents' : ('unicode', False),
    'vect__max_df' : (0.5,0.75,0.95,1.0),
    'clf__max_depth' : (2,5,10,100,200),
    'clf__random_state' :(False, 0)
    
}
#####################################################################
print("Performing grid search")
print('.'*100)
grid_search_rf = GridSearchCV(pipeline_rf, parameters_rf, cv = 5)
grid_search_rf.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_rf.best_score_)
print("Best parameters set:")
best_parameters_rf = grid_search_rf.best_estimator_.get_params()
for param_name in sorted(parameters_rf.keys()):
  print("\t%s: %r" % (param_name, best_parameters_rf[param_name]))

Performing grid search
....................................................................................................
Best score: 0.808
Best parameters set:
	clf__max_depth: 200
	clf__random_state: False
	vect__max_df: 0.5
	vect__strip_accents: 'unicode'


# **E. Meta Classifiers**

In [None]:
"""
estimators: MNB, Linear SVC
Final classifer: Logistic Regression
"""
xtrain,xtest,ytrain,ytest = train_test_split(X_train,y_train, test_size=0.2,random_state=0)
pip_MNB = Pipeline([('vect_MNB',CountVectorizer(tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.5)),
                    ('tfidf_MNB',TfidfTransformer(norm = 'l1')),
                    ('norm_MNB', Normalizer()),
                    ('clf_MNB', MultinomialNB(alpha = 0.01))
])
pip_SVC = Pipeline([('vect_SVC',CountVectorizer(strip_accents='unicode',
                                                tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.95)),
                    ('tfidf_SVC',TfidfTransformer(norm = 'l1')),
                    ('norm_SVC', Normalizer()),
                    ('clf_SVC', LinearSVC())
])

estimators = [
              ('MNB', pip_MNB),
              ('Linear SVC', pip_SVC),
]
meta_clf = StackingClassifier(
                              estimators = estimators,
                              final_estimator=LogisticRegression(max_iter=200)    
)
meta_clf_score = meta_clf.fit(xtrain, ytrain).score(xtest, ytest)
print(meta_clf_score)


In [None]:
"""
estimators: MNB, Linear SVC
Final Classifier: SGD
"""
xtrain,xtest,ytrain,ytest = train_test_split(X_train,y_train, test_size=0.2,random_state=0)
pip_MNB = Pipeline([('vect_MNB',CountVectorizer(tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.5)),
                    ('tfidf_MNB',TfidfTransformer(norm = 'l1')),
                    ('norm_MNB', Normalizer()),
                    ('clf_MNB', MultinomialNB(alpha = 0.01))
])
pip_SVC = Pipeline([('vect_SVC',CountVectorizer(tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.95)),
                    ('tfidf_SVC',TfidfTransformer(norm = 'l1')),
                    ('norm_SVC', Normalizer()),
                    ('clf_SVC', LinearSVC())
])

estimators = [
              ('MNB', pip_MNB),
              ('Linear SVC', pip_SVC),
]
meta_clf = StackingClassifier(
                              estimators = estimators,
                              final_estimator=SGDClassifier()    
)
meta_clf_score = meta_clf.fit(xtrain, ytrain).score(xtest, ytest)
print(meta_clf_score)


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


0.8946914113077256


In [None]:
"""
estimators: MNB, Linear SVC, Random Forest
Final classifer: Logistic Regression(max_iter = 200)
"""
xtrain,xtest,ytrain,ytest = train_test_split(X_train,y_train, test_size=0.2,random_state=0)
pip_MNB = Pipeline([('vect_MNB',CountVectorizer(tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.75)),
                    ('tfidf_MNB',TfidfTransformer(norm = 'l1')),
                    ('norm_MNB', Normalizer()),
                    ('clf_MNB', MultinomialNB(alpha = 0.01))
])
pip_SVC = Pipeline([('vect_SVC',CountVectorizer(tokenizer=New_LemmaTokenizer(),
                                                stop_words=my_stop_words,
                                                max_df = 0.95)),
                    ('tfidf_SVC',TfidfTransformer(norm = 'l1')),
                    ('norm_SVC', Normalizer()),
                    ('clf_SVC', LinearSVC())
])
pip_rfC = Pipeline([('vect_rf',CountVectorizer(strip_accents='unicode',
                                               tokenizer=New_LemmaTokenizer(),
                                               stop_words=my_stop_words,
                                               max_df = 0.5)),
                    ('tfidf_rf',TfidfTransformer(norm = 'l1')),
                    ('norm_rf', Normalizer()),
                    ('clf_rf', RandomForestClassifier(max_depth=200))
])

estimators = [
              ('MNB', pip_MNB),
              ('Linear SVC', pip_SVC),
              ('Random Forest', pip_rfC)
]
meta_clf = StackingClassifier(
                              estimators = estimators,
                              final_estimator=LogisticRegression(max_iter=100)    
)
meta_clf_score = meta_clf.fit(xtrain, ytrain).score(xtest, ytest)
print(meta_clf_score)
final_fit = meta_clf.fit(X_train, y_train)
final_predict = meta_clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9132498921018558


# **3. Final Prediction**

In [None]:
meta_clf_predict = pd.DataFrame(final_predict)
meta_clf_predict.to_csv('meta_clf_predict_0.75_2.csv')
files.download('meta_clf_predict_0.75_2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>