In [1]:
import re
import string
import warnings
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# Rebind `stopwords` from the NLTK corpus reader to a plain set of English stop words.
# After this line the corpus reader is no longer reachable under this name, so later
# code must treat `stopwords` as a set (membership tests / iteration) only.
stopwords = set(stopwords.words('english'))
# Show up to 999 columns when a DataFrame is displayed.
pd.options.display.max_columns = 999
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [2]:
# Load the training set and the competition test set with identical CSV options.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8', delimiter=',')
    for csv_path in ('open/train.csv', 'open/test_x.csv')
)

In [3]:
# Shared WordNet lemmatizer instance used by the text-processing helpers below.
lemmatiser = WordNetLemmatizer()

def text_process(tex):
    """Tokenise a text into lemmatised, stop-word-free words.

    Used as the ``analyzer`` callable of a ``CountVectorizer``.

    Steps: drop every character found in ``string.punctuation``, split on
    whitespace, lemmatise each token as a verb with the module-level
    ``lemmatiser``, then discard tokens whose lower-cased form is in the
    module-level ``stopwords`` set.

    Parameters
    ----------
    tex : str
        Raw input text.

    Returns
    -------
    list of str
        Lemmatised tokens in their original order; case is preserved.
    """
    # Strip punctuation character by character.
    nopunct = ''.join(char for char in tex if char not in string.punctuation)

    # Split once instead of re-splitting the whole text for every token.
    lemmas = [lemmatiser.lemmatize(token, pos="v") for token in nopunct.split()]

    # `stopwords` is rebound to a set in the setup cell, so test membership directly;
    # calling `stopwords.words('english')` on that set raises AttributeError.
    return [word for word in lemmas if word.lower() not in stopwords]

#wordnet_lemmatizer = WordNetLemmatizer()
#stemmer = PorterStemmer()

def tokenize_lemma_stopwords(text):
    """Normalise a text into a space-joined string of cleaned tokens.

    Pipeline: replace newlines with spaces, lower-case, tokenise with
    ``nltk.tokenize.word_tokenize``, keep purely alphabetic tokens, lemmatise
    with the module-level ``lemmatiser``, stem with the module-level
    ``stemmer``, drop tokens of two characters or fewer, and drop tokens
    present in the module-level ``stopwords`` set.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" when none survive).
    """
    text = text.replace("\n", " ")
    # Split the lower-cased string into word tokens.
    tokens = nltk.tokenize.word_tokenize(text.lower())
    # Keep alphabetic tokens only (drops numbers and punctuation tokens).
    tokens = [t for t in tokens if t.isalpha()]
    # Put words into base form. Uses the existing `lemmatiser` instance: the name
    # `wordnet_lemmatizer` is never defined in this notebook and raised NameError.
    tokens = [lemmatiser.lemmatize(t) for t in tokens]
    tokens = [stemmer.stem(t) for t in tokens]
    # Remove short words, they're probably not useful.
    tokens = [t for t in tokens if len(t) > 2]
    # Stop-word removal runs on the stemmed forms.
    # NOTE(review): stemmed stop words may no longer match the unstemmed list - confirm intent.
    tokens = [t for t in tokens if t not in stopwords]
    return " ".join(tokens)

def dataCleaning(df):
    """Return a copy of ``df`` whose ``text`` column has been normalised.

    Every entry of ``df["text"]`` is passed through
    ``tokenize_lemma_stopwords``; the input frame itself is left unmodified.
    """
    normalised_text = df["text"].apply(tokenize_lemma_stopwords)
    return df.assign(text=normalised_text)

def decontract(sentence):
    """Expand English contractions in ``sentence`` and return the result.

    The substitutions are case-sensitive regular-expression rewrites applied
    in the listed order: the two special cases first, then the generic
    apostrophe suffixes.
    """
    rewrite_rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrite_rules:
        sentence = re.sub(pattern, expansion, sentence)
    return sentence

def cleanPunc(sentence):
    """Strip or blank out punctuation in ``sentence``.

    * ``?``, ``!``, ``'``, ``"``, ``#`` and ``|`` are deleted.
    * ``.``, ``,``, ``)``, ``(`` and ``/`` are replaced by a space.
    * Surrounding whitespace is trimmed, then remaining newlines become spaces.

    Inside a character class ``|`` is a literal, which is why pipes are deleted
    by the first pattern; ``\\|`` in the second pattern is an escaped pipe, so
    backslashes are left untouched.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")

def keepAlpha(sentence):
    """Remove every non-letter character from each whitespace-separated word.

    Words are re-joined with single spaces and the result is trimmed. A word
    that consists only of non-letters becomes an empty string, so it can leave
    two consecutive spaces in the middle of the output.
    """
    letters_only = [re.sub('[^a-z A-Z]+', '', token) for token in sentence.split()]
    return " ".join(letters_only).strip()

def removeStopWords(sentence):
    """Delete every match of the module-level ``re_stop_words`` pattern from ``sentence``."""
    # Reading a module-level name needs no `global` declaration.
    without_stop_words = re_stop_words.sub("", sentence)
    return without_stop_words

# stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
#             "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
#             'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
#             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
#             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
#             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
#             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
#             'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
#             'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
#             'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
#             's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
#             've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
#             "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
#             "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
#             'won', "won't", 'wouldn', "wouldn't"])

# Case-insensitive pattern matching any stop word that starts at a word boundary and is
# followed by a non-word character or by the end of the text, so a trailing stop word
# is removed as well.
# Words are regex-escaped and ordered longest-first (ties alphabetical): regex alternation
# takes the first alternative that succeeds, and set iteration order is not stable between
# runs, so without sorting "she" could win over "she's" unpredictably.
re_stop_words = re.compile(
    r"\b("
    + "|".join(re.escape(word) for word in sorted(stopwords, key=lambda w: (-len(w), w)))
    + r")(?:\W|$)",
    re.I,
)

# Shared English Snowball stemmer; used by stemming() and tokenize_lemma_stopwords().
stemmer = SnowballStemmer("english")

def stemming(sentence):
    """Stem every whitespace-separated word of ``sentence``.

    Each word is passed through the module-level ``stemmer``; the stems are
    joined with single spaces and the result is trimmed.
    """
    stems = (stemmer.stem(token) for token in sentence.split())
    return " ".join(stems).strip()

In [None]:
# Build cleaned copies of both frames with the tokenise / lemmatise / stem pipeline.
cleanedTrainData, cleanedTestData = (
    dataCleaning(raw_frame) for raw_frame in (train, test)
)

In [4]:
# Normalisation steps applied, in this order, after lower-casing the text.
text_cleaning_steps = (decontract, cleanPunc, keepAlpha, removeStopWords, stemming)

# Run the identical sequence on the training frame first, then on the test frame.
# Each step overwrites the frame's 'text' column in place.
for frame in (train, test):
    frame['text'] = frame['text'].str.lower()
    for cleaning_step in text_cleaning_steps:
        frame['text'] = frame['text'].apply(cleaning_step)

In [5]:
# X = train['text']
# y = train['author']
# Use the whole training frame for fitting; features for the competition test set.
X_train, y_train = train['text'], train['author']
X_test = test['text']

In [None]:
# Hold out 20% of the labelled data for validation.
# `X` / `y` are not defined anywhere in the notebook (their assignments are commented
# out), so split the training frame's columns directly.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'], train['author'], test_size=0.2, random_state=1234
)

# Bag-of-words counts with text_process as the analyzer; the vocabulary is learned
# from the training split only and then reused for every other set.
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train)
text_bow_train = bow_transformer.transform(X_train)
text_bow_test = bow_transformer.transform(X_test)
real_text_test = bow_transformer.transform(test['text'])

In [None]:
# Hold out 20% of the labelled data for validation.
# `X` / `y` are not defined anywhere in the notebook (their assignments are commented
# out), so split the training frame's columns directly.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'], train['author'], test_size=0.2, random_state=1234
)

# TF-IDF features with stop-word filtering. `stop_words` was an undefined name; the
# notebook's stop-word collection is the `stopwords` set, passed here as a sorted list
# because the vectorizer expects a list (or 'english' / None), not a set.
bow_transformer = TfidfVectorizer(stop_words=sorted(stopwords)).fit(X_train)
text_bow_train = bow_transformer.transform(X_train)
text_bow_test = bow_transformer.transform(X_test)
real_text_test = bow_transformer.transform(test['text'])

In [None]:
# Hold out 25% of the labelled data for validation (shuffled, fixed seed).
# `X` / `y` are not defined anywhere in the notebook (their assignments are commented
# out), so split the training frame's columns directly.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'], train['author'], test_size=0.25, random_state=1234, shuffle=True
)

# Plain TF-IDF features; the vocabulary and IDF weights come from the training split only.
bow_transformer = TfidfVectorizer().fit(X_train)
text_bow_train = bow_transformer.transform(X_train)
text_bow_test = bow_transformer.transform(X_test)
real_text_test = bow_transformer.transform(test['text'])

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(cleanedTrainData, y, test_size=0.2, random_state=1234)

# TF-IDF fitted on the complete cleaned training text (no hold-out split in this cell),
# then applied to the cleaned training and competition test text.
bow_transformer = TfidfVectorizer().fit(cleanedTrainData['text'])
text_bow_train = bow_transformer.transform(cleanedTrainData['text'])
real_text_test = bow_transformer.transform(cleanedTestData['text'])

In [None]:
# Candidate random seeds for the train/validation split; the last entry is used below.
seeds = [1, 43, 678, 90, 135]

# 70/30 shuffled split of the (already normalised) training text and its labels.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'],
    train['author'],
    test_size=0.3,
    random_state=seeds[4],
    shuffle=True,
)

# Uni- to tri-gram TF-IDF with accent stripping, learned from the training split only.
# Alternative that was tried: TfidfVectorizer(stop_words=stopwords)
vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1,3))

# From here on X_train / X_test hold sparse feature matrices instead of raw text.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

real_text_test = vectorizer.transform(test['text'])

In [None]:
# Multinomial naive Bayes on the bag-of-words / TF-IDF training matrix.
model = MultinomialNB().fit(text_bow_train, y_train)
# Recorded score for this model (metric not stated here): 0.6828084321

In [None]:
# Multi-layer perceptron with scikit-learn's default hyper-parameters.
# MLPClassifier is imported with the other dependencies in the notebook's first cell
# rather than inside this cell, so the notebook's requirements are visible in one place.
model = MLPClassifier()
model = model.fit(text_bow_train, y_train)
# Recorded score for this model (metric not stated here): 2.6387692872

In [None]:
# One-vs-rest wrapper around multinomial naive Bayes (priors learned from the data).
base_nb = MultinomialNB(fit_prior=True, class_prior=None)
model = OneVsRestClassifier(base_nb).fit(text_bow_train, y_train)
# Recorded score for this model (metric not stated here): 0.6695200716

In [None]:
# RBF-kernel SVM with probability estimates enabled so predict_proba is available.
# SVC is imported with the other dependencies in the notebook's first cell rather than
# inside this cell, so the notebook's requirements are visible in one place.
# Labels are taken from the full training frame here, so `text_bow_train` must have been
# built from all training rows (not from a train/validation split).
model = SVC(kernel='rbf', probability=True)
model = model.fit(text_bow_train, train['author'])
# Recorded score for this model (metric not stated here): 0.5319915652

In [None]:
# One-vs-rest logistic regression (SAG solver, all cores) wrapped in a single-step pipeline.
lr_one_vs_rest = OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)
LR_pipeline = Pipeline(steps=[('clf', lr_one_vs_rest)]).fit(X_train, y_train)

# Accuracy on the held-out split; the bare expression is the cell's displayed output.
prediction = LR_pipeline.predict(X_test)
accuracy_score(y_test, prediction)

In [None]:
# One-vs-rest multinomial naive Bayes wrapped in a single-step pipeline.
nb_one_vs_rest = OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))
NB_pipeline = Pipeline(steps=[('clf', nb_one_vs_rest)]).fit(X_train, y_train)

# Accuracy on the held-out split; the bare expression is the cell's displayed output.
prediction = NB_pipeline.predict(X_test)
accuracy_score(y_test, prediction)

In [None]:
# One-vs-rest linear SVM (single job) wrapped in a single-step pipeline.
svc_one_vs_rest = OneVsRestClassifier(LinearSVC(), n_jobs=1)
SVC_pipeline = Pipeline(steps=[('clf', svc_one_vs_rest)]).fit(X_train, y_train)

# Accuracy on the held-out split; the bare expression is the cell's displayed output.
prediction = SVC_pipeline.predict(X_test)
accuracy_score(y_test, prediction)

In [None]:
# Predict on the vectorised competition test set with the fitted LinearSVC pipeline.
# NOTE(review): predict() is expected to yield one label per row, whereas the submission
# cell assigns `pred` to five columns ('0'..'4') - presumably per-class scores are needed
# there; confirm before using this `pred` for the submission.
pred = SVC_pipeline.predict(real_text_test)

In [None]:
# pred = SVC_pipeline.predict_proba(real_text_test)

In [None]:
# Per-class probabilities for the competition test set.
# The submission cell writes `pred` into the five columns '0'..'4', which requires a
# 2-D (n_samples, n_classes) array; predict() would return a 1-D vector of labels.
# Every `model` trained above exposes predict_proba (SVC is built with probability=True).
# Columns follow model.classes_ - assumes the author labels are 0..4 so that this order
# matches the submission column names; TODO confirm.
pred = model.predict_proba(real_text_test)

In [None]:
# Load the submission template whose prediction columns are filled in below.
sample_submission = pd.read_csv('open/sample_submission.csv', encoding = 'utf-8', delimiter=',')

In [None]:
# Preview the first rows of the template to check its column layout.
sample_submission.head()

In [None]:
# Write the predictions into the five columns named '0'..'4'.
# NOTE(review): this needs `pred` to provide one value per row for each of the five
# columns - verify the shape of `pred` before running this cell.
sample_submission[['0','1','2','3','4']] = pred

In [None]:
# Save the filled-in template as the submission file, without the DataFrame index.
sample_submission.to_csv('submission.csv', index = False, encoding = 'utf-8', sep=',')