In [2]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import nltk
# Fetch data non-interactively: a bare nltk.download() opens the interactive
# downloader and blocks a Restart & Run All. Only the stop-word corpus is
# requested here; it is presumably what SnowballStemmer(ignore_stopwords=True)
# needs -- add further package ids if other corpora are used elsewhere.
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [21]:
# Input CSVs, resolved relative to the current working directory.
TRAINING_SET_PATH = './bmv_training_set.csv'
TEST_SET_PATH = './bmv_test_set.csv'

# `data` supplies the 'additionalAttributes' text and 'label' target used for
# fitting; `test_data` supplies the 'additionalAttributes' text to be predicted.
data = pd.read_csv(TRAINING_SET_PATH)
test_data = pd.read_csv(TEST_SET_PATH)

In [22]:
# Module-level English Snowball stemmer shared by every vectorizer instance.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token its analyzer produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to a list of stemmed tokens.

        The parent class's analyzer is built first and its output is passed,
        token by token, through the module-level ``stemmer``.
        """
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # NOTE(review): when ngram_range goes beyond unigrams the parent
            # analyzer presumably yields multi-word tokens, each of which is
            # handed to the stemmer as one string -- confirm this is intended.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


In [23]:
def clean_attributes(s):
    """Expand a row's packed ``additionalAttributes`` string into named fields.

    ``s['additionalAttributes']`` is split on ``;``. Each chunk is then split
    on ``=`` and, failing that, on ``:``; a chunk that yields exactly two
    pieces becomes a ``key -> value`` entry. Chunks that match neither form
    are reported on stdout prefixed with ``====``; empty chunks (for example
    from a trailing ``;``) are skipped silently.

    Parameters
    ----------
    s : mapping
        Row-like object indexable by ``'id'``, ``'additionalAttributes'``
        and ``'label'``.

    Returns
    -------
    pandas.Series
        The three original fields plus one entry per parsed attribute.
        A parsed key that equals one of the original field names replaces it.
    """
    parsed = dict(id=s['id'], additionalAttributes=s['additionalAttributes'], label=s['label'])
    for chunk in s['additionalAttributes'].split(";"):
        if not chunk:
            continue
        for separator in ("=", ":"):
            pieces = chunk.split(separator)
            if len(pieces) == 2:
                parsed[pieces[0]] = pieces[1]
                break
        else:
            # Single-argument parenthesised print behaves the same on Python 2 and 3.
            print("==== %s" % chunk)

    # Return the populated dict; previously a fresh dict holding only the three
    # base fields was returned, discarding every parsed attribute.
    return pd.Series(parsed)

In [24]:
def tfidf_transformation(data,test_data,test_size=0.3):
    """Select the text and label columns used for training and prediction.

    Despite its name, no TF-IDF weighting happens here; the function only
    picks columns and, when needed, splits ``data``.

    Parameters
    ----------
    data : frame-like with 'additionalAttributes' and 'label' columns.
    test_data : frame-like with an 'additionalAttributes' column, or None.
    test_size : passed to ``train_test_split`` when ``test_data`` is None.

    Returns
    -------
    tuple ``(x_train, y_train, x_test, y_test)``. When ``test_data`` is given,
    all of ``data`` is used for training and ``y_test`` is None. Otherwise
    ``data`` is split and both parts come from it.
    """
    if test_data is not None:
        return (
            data['additionalAttributes'],
            data['label'],
            test_data['additionalAttributes'],
            None,
        )

    train_part, held_out_part = train_test_split(data, test_size=test_size)
    return (
        train_part['additionalAttributes'],
        train_part['label'],
        held_out_part['additionalAttributes'],
        held_out_part['label'],
    )

In [25]:
def MultinomialNB_classifier(x_train,y_train,x_test,y_test,apply_grid_search=False,is_training=True):
    """Fit a stemmed bag-of-words + TF-IDF + Multinomial NB pipeline and predict.

    Parameters
    ----------
    x_train, y_train : training documents and their labels.
    x_test : documents to predict.
    y_test : true labels for ``x_test``, or None when they are unknown.
    apply_grid_search : when True, search the parameter grid with
        ``GridSearchCV`` and print the best score and parameters; otherwise
        fit the pipeline with the hard-coded parameters.
    is_training : when True and ``y_test`` is not None, return accuracy
        instead of the raw predictions.

    Returns
    -------
    The fraction of predictions equal to ``y_test`` in training mode,
    otherwise the predicted labels for ``x_test``.
    """
    text_clf = Pipeline([('vect', StemmedCountVectorizer(ngram_range=(1,2),stop_words='english')),
                   ('tfidf', TfidfTransformer(use_idf=True,norm='l1')),
                   ('clf', MultinomialNB(fit_prior=False)),])
    if apply_grid_search:
        parameters_mnb = {'vect__ngram_range': [(1, 1), (1, 2)],
                       'tfidf__use_idf': (True, False),
                       'tfidf__norm' :('l1','l2',None),
                       'clf__fit_prior': (True, False),
         }
        gs_clf_mnb = GridSearchCV(text_clf, parameters_mnb, n_jobs=-1)
        gs_clf_mnb = gs_clf_mnb.fit(x_train, y_train)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(gs_clf_mnb.best_score_)
        print(gs_clf_mnb.best_params_)
        # Predict with the fitted search object. The search fits clones of
        # text_clf, so text_clf itself is still unfitted here and calling
        # predict on it would fail.
        predicted = gs_clf_mnb.predict(x_test)
    else:
        text_clf = text_clf.fit(x_train, y_train)
        predicted = text_clf.predict(x_test)
    if is_training and y_test is not None:
        return np.mean(predicted == y_test)
    return predicted

In [26]:
def svm_classifier(x_train,y_train,x_test,y_test,apply_grid_search=False,is_training=True):
    """Fit a stemmed bag-of-words + TF-IDF + SVC pipeline and predict.

    With ``apply_grid_search`` the pipeline is tuned via ``GridSearchCV``
    (best score and parameters are printed) and the fitted search object makes
    the predictions; otherwise the pipeline is fitted with the hard-coded
    parameters.

    Returns the fraction of predictions equal to ``y_test`` when
    ``is_training`` is truthy and ``y_test`` is not None, else the predicted
    labels for ``x_test``.
    """
    vectorizer = StemmedCountVectorizer(stop_words='english', ngram_range=(1,2))
    weighting = TfidfTransformer(use_idf=False)
    svc = SVC(kernel='linear', C = 10.0, degree=10, probability=True, tol=1e-3)
    fitted = Pipeline([('vect', vectorizer), ('tfidf', weighting), ('clf-svm', svc)])

    if not apply_grid_search:
        fitted.fit(x_train, y_train)
    else:
        search_space = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'tfidf__norm': ('l1', 'l2', None),
            'clf-svm__tol': (1e-2, 1e-3),
            'clf-svm__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        }
        fitted = GridSearchCV(fitted, search_space, n_jobs=-1).fit(x_train, y_train)
        print(fitted.best_score_)
        print(fitted.best_params_)

    predictions = fitted.predict(x_test)
    if is_training and y_test is not None:
        return np.mean(predictions == y_test)
    return predictions


In [27]:
def boosting_classifier(x_train,y_train,x_test,y_test,apply_grid_search=False,is_training=True):
    """Fit a stemmed bag-of-words + TF-IDF + AdaBoost (decision tree) pipeline.

    Parameters
    ----------
    x_train, y_train : training documents and their labels.
    x_test : documents to predict.
    y_test : true labels for ``x_test``, or None when they are unknown.
    apply_grid_search : when True, search the parameter grid with
        ``GridSearchCV`` and print the best score and parameters; otherwise
        fit the pipeline with the hard-coded parameters.
    is_training : when True and ``y_test`` is not None, return accuracy
        instead of the raw predictions.

    Returns
    -------
    The fraction of predictions equal to ``y_test`` in training mode,
    otherwise the predicted labels for ``x_test``.
    """
    text_clf_boosting = Pipeline([('vect', StemmedCountVectorizer(stop_words='english',ngram_range=(1,2))),
                          ('tfidf', TfidfTransformer(use_idf=True,norm='l1')),
                          ('clf-boosting', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=99,
                                                                                     min_samples_split=3,
                                                                                     criterion="gini",
                                                                                     splitter="best",
                                                                                     max_depth=11),
                                                              n_estimators=100,learning_rate=.1)),
     ])
    if apply_grid_search:
        parameters_boosting = {'vect__ngram_range': [(1, 1), (1, 2)],
                       'tfidf__use_idf': (True, False),
                       'tfidf__norm' :('l1','l2',None),
                       'clf-boosting__learning_rate': (.1, .2,.3),
         }
        gs_clf_boosting = GridSearchCV(text_clf_boosting, parameters_boosting, n_jobs=-1)
        gs_clf_boosting = gs_clf_boosting.fit(x_train,y_train)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(gs_clf_boosting.best_score_)
        print(gs_clf_boosting.best_params_)
        # Store under the name read below; this branch used to assign
        # `predicted_svm`, leaving `predicted_boosting` unbound.
        predicted_boosting = gs_clf_boosting.predict(x_test)
    else:
        _ = text_clf_boosting.fit(x_train,y_train)
        predicted_boosting = text_clf_boosting.predict(x_test)
    if is_training and y_test is not None:
        return np.mean(predicted_boosting == y_test)
    return predicted_boosting


In [28]:
def sgd_classifier(x_train,y_train,x_test,y_test,apply_grid_search=False,is_training=True):
    """Fit a stemmed bag-of-words + TF-IDF + SGD (hinge loss) pipeline and predict.

    With ``apply_grid_search`` the pipeline is tuned via ``GridSearchCV``
    (best score and parameters are printed) and the fitted search object makes
    the predictions; otherwise the pipeline is fitted with the hard-coded
    parameters.

    Returns the fraction of predictions equal to ``y_test`` when
    ``is_training`` is truthy and ``y_test`` is not None, else the predicted
    labels for ``x_test``.
    """
    steps = [
        ('vect', StemmedCountVectorizer(stop_words='english', ngram_range=(1,2))),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf-sgd', SGDClassifier(loss='hinge', penalty='l2',
                                  alpha=1e-3, n_iter=5, random_state=42)),
    ]
    model = Pipeline(steps)

    if apply_grid_search:
        search_space = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'tfidf__norm': ('l1', 'l2', None),
            'clf-sgd__alpha': (1e-2, 1e-3),
        }
        model = GridSearchCV(model, search_space, n_jobs=-1).fit(x_train, y_train)
        print(model.best_score_)
        print(model.best_params_)
    else:
        model.fit(x_train, y_train)

    labels = model.predict(x_test)
    wants_accuracy = is_training and y_test is not None
    return np.mean(labels == y_test) if wants_accuracy else labels

In [29]:
# A test frame is supplied, so the whole training set is used for fitting and
# the returned y_test is None (no true labels for the test set).
x_train, y_train, x_test, y_test = tfidf_transformation(data, test_data=test_data)

In [30]:
# Alternative models, left disabled:
# print boosting_classifier(x_train,y_train,x_test,y_test,False)
# print svm_classifier(x_train,y_train,x_test,y_test, False)

# Predict test-set labels with the hard-coded Multinomial NB pipeline:
# no grid search, and raw predictions rather than an accuracy score.
multinomialnb_results = MultinomialNB_classifier(
    x_train, y_train, x_test, None,
    apply_grid_search=False, is_training=False
)

In [31]:
# Predict test-set labels with the hard-coded SGD pipeline:
# no grid search, and raw predictions rather than an accuracy score.
sgd_results = sgd_classifier(
    x_train, y_train, x_test, None,
    apply_grid_search=False, is_training=False
)



In [36]:
# Work on a copy: plain assignment would alias test_data, so adding 'label'
# would mutate the shared frame and any other cell writing test_data['label']
# would silently change this result as well.
sgd_whole_test_data = test_data.copy()
sgd_whole_test_data['label'] = sgd_results
sgd_whole_test_data.to_csv('sgd_results.csv', index=False)

In [35]:
# Work on a copy: plain assignment would alias test_data, so adding 'label'
# would mutate the shared frame and any other cell writing test_data['label']
# would silently change this result as well.
multinomialnb_whole_test_data = test_data.copy()
multinomialnb_whole_test_data['label'] = multinomialnb_results
multinomialnb_whole_test_data.to_csv('multinomialnb_results.csv', index=False)

In [None]:
# To tune hyper-parameters with grid search and report accuracy on a held-out split of the training data:
# Note: this is slow on a laptop; prefer a multi-core server, since the grid search runs with n_jobs=-1.
# x_train,y_train,x_test,y_test = tfidf_transformation(data,None)
# print MultinomialNB_classifier(x_train,y_train,x_test,y_test,True,True)
# print sgd_classifier(x_train,y_train,x_test,y_test,True,True)

# To measure accuracy only (no grid search), using just the hard-coded parameters:
# print MultinomialNB_classifier(x_train,y_train,x_test,y_test,False,True)
# print sgd_classifier(x_train,y_train,x_test,y_test,False,True)
