In [41]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import mglearn
import nltk
from nltk.stem import PorterStemmer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from nltk.tag import pos_tag, map_tag
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.base import TransformerMixin
import re

%matplotlib inline
train = pd.read_csv("spooky-authors/train.zip", index_col=['id'])
test = pd.read_csv("spooky-authors/test.zip", index_col=['id'])
sample_submission = pd.read_csv("spooky-authors/sample_submission.zip", index_col=['id'])

In [2]:
print(train.shape)
print(test.shape)

(19579, 2)
(8392, 1)


Идеята е да ползвам LDA, за да открия n на брой теми. С малко късмет различните автори ще са писали по различни теми и това ще помогне за тяхното идентифициране.

Използвам LDA с 20 теми (няма смисъл от повече, резултата не се променя особено).

In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import coo_matrix, hstack

vectorizer = CountVectorizer(max_df=.15, max_features=10000)
X = vectorizer.fit_transform(train.text)
lda = LatentDirichletAllocation(n_components=20, 
                                learning_method="batch", max_iter=15, random_state=0)
topics = lda.fit_transform(X)
topics.shape

(19579, 20)

Първо пробвам да предрека автора само с генерираните теми. 

In [10]:
def get_text(df):
    return df["text"]

In [38]:
tfidf_vectorizer = make_union(*[
    make_pipeline(FunctionTransformer(get_text, validate=False), TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                                                                 max_df=0.8, lowercase=False)),
])

lda_f = LatentDirichletAllocation(n_components=20,
                                learning_method="batch", max_iter=15, random_state=0)
vectorizer_f = CountVectorizer(max_df=.15, max_features=10000)
lda_features = make_union(*[
    make_pipeline(FunctionTransformer(get_text, validate=False),
                  vectorizer_f,
                  lda_f)
])
tfidf_lda = make_union(*[
    tfidf_vectorizer,
    lda_features
])

pipeline_lda = make_pipeline(lda_features, MultinomialNB(alpha=0.01))
pipeline_tfidf_lda = make_pipeline(tfidf_lda, MultinomialNB(alpha=0.01))
pipeline_tfidf = make_pipeline(tfidf_vectorizer, MultinomialNB(alpha=0.01))

In [14]:
print(cross_val_score(pipeline_lda, train, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda, train, train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.5513174   0.52129942  0.5394636 ]
[-0.96128556 -0.99249138 -0.98830282]


Не е много впечатляващ резултат. Да видим какви теми намери LDA.

In [17]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vectorizer.get_feature_names())
mglearn.tools.print_topics(topics=range(15), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
on            at            by            on            his           
from          or            an            by            at            
were          three         upon          day           on            
they          this          this          house         eyes          
through       about         or            morning       door          
down          two           his           from          into          
out           upon          at            next          up            
by            time          from          this          from          
up            five          one           him           room          
moon          one           is            so            upon          


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
all 

Прави ми впечатление, че повечето теми съдържат главно стоп думи, които са популярни за всеки текст. Може би е добра идея да се премахнат. 

Така и така ще ги махаме направо и един стемер да ударим.

In [19]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()
explore = train.copy()
explore['no_stop'] = explore.text.apply(lambda s: " ".join([stemmer.stem(w) for w in str(s).split() if w.lower() not in stopwords]))

In [29]:
def get_no_stop(df):
    return df["no_stop"]

In [31]:
lda_features_no_stop = make_union(*[
    make_pipeline(FunctionTransformer(get_no_stop, validate=False),
                  vectorizer_f,
                  lda_f)
])
pipeline_lda_no_stop = make_pipeline(lda_features_no_stop, MultinomialNB(alpha=0.01))

In [32]:
print(cross_val_score(pipeline_lda_no_stop, explore, explore.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda_no_stop, explore, explore.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.50903799  0.47931352  0.46329502]
[-1.00826273 -1.02304674 -1.03234007]


Резултата се влоши. Да видим новите теми.

In [34]:
vectorizer = CountVectorizer(max_df=.15, max_features=10000)
X = vectorizer.fit_transform(explore.no_stop)
lda = LatentDirichletAllocation(n_components=20, 
                                learning_method="batch", max_iter=15, random_state=0)
lda.fit(X)
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vectorizer.get_feature_names())
mglearn.tools.print_topics(topics=range(15), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
night         time          would         one           upon          
upon          power         could         would         may           
wind          could         me            say           littl         
sound         upon          one           could         found         
pass          everi         time          matter        also          
came          spent         english       fact          despair       
time          him           mani          however       then          
heard         natur         voic          even          turn          
hour          whose         yet           mere          even          
seem          first         seem          upon          everi         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
an  

Сега изглеждат по-добре, но резултата се влоши. Най-вероятно въпреки, че изглеждат по релевантни, темите сега са по-общи за авторите. 

Да видим как ще се справя в комбинация с Bag Of Words.

In [37]:
print(cross_val_score(pipeline_tfidf_lda, train, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_tfidf_lda, train, train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83011642  0.83588722  0.83264368]
[-0.4305994  -0.42365256 -0.43169241]


Резултата отново не е впечатляващ. Да сравним с чист Bag Of Words.

In [39]:
print(cross_val_score(pipeline_tfidf, train, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_tfidf, train, train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83195466  0.83466135  0.83187739]
[-0.42530307 -0.418245   -0.42500535]


Няма подобрение. Дори е малко по-зле.

Ще пробвам да добавя feature среден брой думи в изречение. Също така ще пробвам да преброя различните части на речта във всяко изречение, т.е. ще ползвам pos tagger.


In [129]:
def get_average_words_per_sentence(sentences):
        count_words = []
        for sentence in sentences:
            count_words.append(len(sentence.split(" ")))
        return sum(count_words) / len(count_words)

In [157]:
def add_average_number_of_words(df):
    df_new = df.copy()
    df_new["average_words_per_sentence"] = df_new.apply(
            lambda x: get_average_words_per_sentence(re.split(r"[.?!]", x["text"])), axis=1)
    return df_new

In [131]:
TAGS = ["ADJ", "ADP", "ADV", "CONJ", "DET", "NOUN", "NUM", "PRT", "PRON", "VERB", ".", "X"]

In [132]:
def count_all_tags(text):
        counter = defaultdict(int)
        tagged = nltk.pos_tag(nltk.word_tokenize(text), tagset="universal")
        for _, tag in tagged:
            counter[tag] += 1
        return counter

In [153]:
def add_pos_features(df, tags):
    df_new = df.copy()
    for tag in tags:
        df_new[tag] = np.zeros(df_new.shape[0]).astype(int)
        
    for i, row in df_new.iterrows():
        tags_counted = count_all_tags(row.text)
        for tag, frequency in tags_counted.items():
            df_new.set_value(i, tag, frequency)
    return df_new

По принцип nltk.pos_tag тагва с много прецизни тагове, които са и много наброй. Реших, че е достатъчно да ползвам по-общи категории, затова ползвам tagset="universal". В този съкратен тагсет повечето тагове са self-explanatory с изключение на "." и "X". С точката се тагва всякаква пунктоация, като например .,?! и тн. X са всички думи, които са неизвестни за тагера, например думи, които не са на английски език.

Да видим как се справя модела с новите features.

In [155]:
train_pos_tagged = add_pos_features(train, TAGS)
test_pos_tagged = add_pos_features(test, TAGS)
train_pos_tagged.head()

Unnamed: 0_level_0,text,author,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,.,X
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
id26305,"This process, however, afforded me no means of...",EAP,2,5,3,1,6,10,0,2,5,7,7,0
id17569,It never once occurred to me that the fumbling...,HPL,1,1,2,0,2,2,0,1,2,3,1,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,5,6,1,0,6,10,0,1,3,4,5,0
id27763,How lovely is spring As we looked from Windsor...,MWS,6,6,2,2,2,10,0,0,1,5,4,0
id12958,"Finding nothing else, not even gold, the Super...",HPL,1,3,4,1,2,7,0,0,4,5,4,0


In [158]:
train_pos_avg = add_average_number_of_words(train_pos_tagged)
test_pos_avg = add_average_number_of_words(test_pos_tagged)
train_pos_avg.head()

Unnamed: 0_level_0,text,author,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,.,X,average_words_per_sentence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
id26305,"This process, however, afforded me no means of...",EAP,2,5,3,1,6,10,0,2,5,7,7,0,21.0
id17569,It never once occurred to me that the fumbling...,HPL,1,1,2,0,2,2,0,1,2,3,1,0,7.5
id11008,"In his left hand was a gold snuff box, from wh...",EAP,5,6,1,0,6,10,0,1,3,4,5,0,18.5
id27763,How lovely is spring As we looked from Windsor...,MWS,6,6,2,2,2,10,0,0,1,5,4,0,17.5
id12958,"Finding nothing else, not even gold, the Super...",HPL,1,3,4,1,2,7,0,0,4,5,4,0,14.0


In [47]:
def drop_author_text(df):
    return df.drop(["author", "text"], axis=1)

In [160]:
tfidf = make_union(*[
    make_pipeline(FunctionTransformer(get_text, validate=False),
                                  TfidfVectorizer(ngram_range=(1, 2), min_df=2, 
                                                              max_df=0.8, lowercase=False)),
])
pipeline_tfidf = make_pipeline(tfidf, MultinomialNB(alpha=0.01))

In [162]:
print(cross_val_score(pipeline_tfidf, train_pos_avg.drop(["author"], axis=1), train_pos_tagged.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_tfidf, train_pos_avg.drop(["author"], axis=1), train_pos_tagged.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.83195466  0.83466135  0.83187739]
[-0.42530307 -0.418245   -0.42500535]


Резултатът се влоши. Да видим какъв е резултата без tfidf.

In [163]:
print(cross_val_score(MultinomialNB(alpha=0.01), train_pos_avg.drop(["author", "text"], axis=1), train_pos_tagged.author, cv=3, n_jobs=3))
print(cross_val_score(MultinomialNB(alpha=0.01), train_pos_avg.drop(["author", "text"], axis=1), train_pos_tagged.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.52864583  0.52896108  0.53195402]
[-0.99589232 -0.99010315 -0.98235638]


In [164]:
print(cross_val_score(LogisticRegression(), train_pos_avg.drop(["author", "text"], axis=1), train_pos_tagged.author, cv=3, n_jobs=3))
print(cross_val_score(LogisticRegression(), train_pos_avg.drop(["author", "text"], axis=1), train_pos_tagged.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.54718137  0.55194606  0.56076628]
[-0.9443998  -0.93700313 -0.93107771]


Резултатът с логистична регресия е малко по-добър. 

Ще опитам да комбинирам LDA с пос таговете и средния брой думи.

In [178]:
lda_avg = make_union(*[
    lda_features,
    make_pipeline(FunctionTransformer(drop_author_text, validate=False))
])
pipeline_lda_avg = make_pipeline(lda_avg, LogisticRegression(C=100))

In [166]:
print(cross_val_score(pipeline_lda_avg, train_pos_avg, train_pos_tagged.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda_avg, train_pos_avg, train_pos_tagged.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.6395527   0.62886914  0.63019157]
[-0.80370316 -0.84348148 -0.82907251]


Заедно все пак са по-добре от колкото отделно. Опитвам и с MultinomialNB.

In [167]:
pipeline_lda_avg_nb = make_pipeline(lda_avg, MultinomialNB(alpha=0.01))
print(cross_val_score(pipeline_lda_avg_nb, train_pos_tagged, train_pos_tagged.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda_avg_nb, train_pos_tagged, train_pos_tagged.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.5859375   0.57814894  0.57563218]
[-0.90826364 -0.9333977  -0.92121947]


Резултата е по зле.

Зачетох се в тази [статия](http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/) и реших да пробвам model stacking (макар и само с един модел). Идеята е да се сметнат вероятностите само с tfidf векторизиране и после те да се ползват като фийчъри. 

In [62]:
def stack_model(train, test, model, splits):
    kf = StratifiedKFold(n_splits=splits, shuffle=True)
    proba_train = pd.np.zeros([train.shape[0], 3])
    proba_test = pd.np.zeros([test.shape[0], 3])
    y = train.author
    for train_indices, test_indices in kf.split(train, y):
        split_train_x, split_train_y = train.iloc[train_indices], y.iloc[train_indices]
        split_test_x =  train.iloc[test_indices]
        
        model.fit(split_train_x, split_train_y)
        proba_train[test_indices] = model.predict_proba(split_test_x)
        proba_test += model.predict_proba(test)
    proba_test /= splits
    train_df = pd.DataFrame(proba_train, columns=['EAP', 'HPL', 'MWS'], index=train.index)
    test_df = pd.DataFrame(proba_test, columns=['EAP', 'HPL', 'MWS'], index=test.index)
    return train_df, test_df

In [63]:
train_proba, test_proba = stack_model(train, test, pipeline_tfidf, 10)

In [64]:
train_proba.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id26305,0.994668,0.000746,0.004586
id17569,0.599233,0.221939,0.178828
id11008,0.922528,0.077064,0.000407
id27763,0.000387,0.000252,0.99936
id12958,0.34018,0.519122,0.140698


In [168]:
stacked_train = pd.concat([train_pos_avg, train_proba], axis=1)
stacked_test = pd.concat([test_pos_avg, test_proba], axis=1)

In [169]:
stacked_train.head()

Unnamed: 0_level_0,text,author,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,.,X,average_words_per_sentence,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
id26305,"This process, however, afforded me no means of...",EAP,2,5,3,1,6,10,0,2,5,7,7,0,21.0,0.994668,0.000746,0.004586
id17569,It never once occurred to me that the fumbling...,HPL,1,1,2,0,2,2,0,1,2,3,1,0,7.5,0.599233,0.221939,0.178828
id11008,"In his left hand was a gold snuff box, from wh...",EAP,5,6,1,0,6,10,0,1,3,4,5,0,18.5,0.922528,0.077064,0.000407
id27763,How lovely is spring As we looked from Windsor...,MWS,6,6,2,2,2,10,0,0,1,5,4,0,17.5,0.000387,0.000252,0.99936
id12958,"Finding nothing else, not even gold, the Super...",HPL,1,3,4,1,2,7,0,0,4,5,4,0,14.0,0.34018,0.519122,0.140698


In [177]:
print(cross_val_score(LogisticRegression(C=100), stacked_train.drop(["text", "author"], axis=1), stacked_train.author, cv=3, n_jobs=3))
print(cross_val_score(LogisticRegression(C=100), stacked_train.drop(["text", "author"], axis=1), stacked_train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.85401348  0.85442844  0.8559387 ]
[-0.40179466 -0.39357812 -0.39269627]


Това подобри резултата!

Пробвах няколко C-та на ръка, със 100 има най-добър резултат.

Да видим с lda дали няма да се подобри.

In [179]:
print(cross_val_score(pipeline_lda_avg, stacked_train, stacked_train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda_avg, stacked_train, stacked_train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.85278799  0.85688017  0.85471264]
[-0.39859616 -0.39149199 -0.38997119]


log_loss-а падна малко

Последна идея за подобрение. От разгледаните фийчъри на лекции забелязах (след доста пробване), че unique_words дава най-добри резултати. Ще пробвам да го добавя.

In [180]:
stacked_train["unique_words"] = stacked_train.text.apply(lambda s: len(set(str(s).split())))
stacked_test["unique_words"] = stacked_test.text.apply(lambda s: len(set(str(s).split())))

In [181]:
print(cross_val_score(pipeline_lda_avg, stacked_train, stacked_train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline_lda_avg, stacked_train, stacked_train.author,cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

[ 0.85278799  0.8584125   0.8559387 ]
[-0.3937938  -0.38883294 -0.38583781]


Има малко подобрение. Време е за събмит.

In [76]:
def drop_text(df):
    return df.drop(["text"], axis=1)

In [182]:
avg_submit = make_union(*[
    lda_features,
    make_pipeline(FunctionTransformer(drop_text, validate=False))
])
pipeline_submit = make_pipeline(avg_submit, LogisticRegression(C=100))
pipeline_submit.fit(stacked_train.drop(["author"], axis=1), stacked_train.author)
prediction = pipeline_submit.predict_proba(stacked_test)
submit_file = pd.DataFrame(prediction, columns=['EAP', 'HPL', 'MWS'], index=test.index)
submit_file.head(10)
submit_file.to_csv("~/Desktop/spooky_prediction.csv")

Резултата в кагъл е:

<img src="img/spooky.png"/>