# Spooky Author Identification

In [1]:
import pandas as pd
# Show every column when a frame is displayed. The fully qualified key is
# required: the bare 'max_columns' pattern is ambiguous on pandas versions that
# also define 'styler.render.max_columns' and raises OptionError there.
pd.set_option('display.max_columns', None)

# All three competition files are read straight from their zip archives and
# indexed by the 'id' column.
train = pd.read_csv("data/train.zip", index_col=['id'])
test = pd.read_csv("data/test.zip", index_col=['id'])
sample_submission = pd.read_csv("data/sample_submission.zip", index_col=['id'])

# Sanity check: frame shapes, and the columns present in train but not in test.
print(train.shape, test.shape, sample_submission.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1) (8392, 3)
{'author'}


In [2]:
# Show the first rows of the training frame.
train.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Show the first rows of the test frame.
test.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
id02310,"Still, as I urged our leaving Ireland with suc..."
id24541,"If a fire wanted fanning, it could readily be ..."
id00134,And when they had broken down the frail door t...
id27757,While I was thinking how I should possibly man...
id04081,I am not sure to what limit his knowledge may ...


In [4]:
# Show the first rows of the sample submission to see the expected column layout.
sample_submission.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.403494,0.287808,0.308698
id24541,0.403494,0.287808,0.308698
id00134,0.403494,0.287808,0.308698
id27757,0.403494,0.287808,0.308698
id04081,0.403494,0.287808,0.308698


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

In [6]:
# Baseline: raw token counts fed into a linear SVM with default settings.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', LinearSVC()),
])

# No `scoring` is given, so each of the 3 folds reports the estimator's default score.
cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3)

array([ 0.78783701,  0.79635305,  0.79509579])

In [7]:
from sklearn.ensemble import RandomForestClassifier
# Same count features, with the classifier swapped for a default random forest.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', RandomForestClassifier()),
])

cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3)

array([ 0.61259191,  0.61630401,  0.60781609])

In [8]:
# Score `pipeline` again on 3 folds, this time with negative log loss.
cross_val_score(
    pipeline,
    X=train.text,
    y=train.author,
    scoring='neg_log_loss',
    cv=3,
    n_jobs=3,
)

array([-1.47875413, -1.27711926, -1.40480171])

In [9]:
from sklearn.linear_model import LogisticRegression

# Token counts fed into a default logistic regression.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', LogisticRegression()),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.81449142  0.81673307  0.81348659]
[-0.47678328 -0.47558895 -0.47131481]


In [10]:
import nltk
# Load NLTK's English stop-word list. On a machine where the corpus has not
# been fetched yet NLTK raises LookupError, so download it once and retry
# instead of relying on a manually uncommented nltk.download() call.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
print(len(stopwords))
print(stopwords)

153
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should',

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

# Vectorizer search space for word-level n-grams. Keys use the
# '<pipeline step>__<parameter>' convention, so they target the 'features' step.
# NOTE: `stopwords` must already be defined (the NLTK stop-word list cell).
params_count_word = {"features__ngram_range": [(1,1), (1,2), (1,3)],
                      "features__analyzer": ['word'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None, stopwords]}

# Vectorizer search space for character-level n-grams. stop_words is pinned to
# None: scikit-learn applies stop words only when analyzer == 'word', so
# offering the list here would merely duplicate otherwise identical candidates.
params_count_char = {"features__ngram_range": [(1,4), (1,5), (1,6)],
                      "features__analyzer": ['char'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None]}

NameError: name 'stopwords' is not defined

In [4]:
import numpy as np

def report(results, n_top=5):
    """Print the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', each indexable by integer position.
    n_top : int, default 5
        Highest rank to print. Ranks 1..n_top are visited in order and every
        candidate tied on a rank is printed.
    """
    header_fmt = "Model with rank: {0}"
    score_fmt = "Mean validation score: {0:.3f} (std: {1:.3f})"
    params_fmt = "Parameters: {0}"

    for rank in range(1, n_top + 1):
        # Positions of all candidates sharing this rank (ties are possible).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print(header_fmt.format(rank))
            mean = results['mean_test_score'][idx]
            std = results['std_test_score'][idx]
            print(score_fmt.format(mean, std))
            print(params_fmt.format(results['params'][idx]))
            print("")

In [82]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
def random_search():
    """Randomized search for a count-vectorizer + logistic-regression pipeline.

    Samples 20 candidates from the classifier options below combined with
    ``params_count_word``, scores each with 3-fold negative log loss on
    ``train.text`` / ``train.author`` and prints the top results via ``report``.
    """
    clf_space = {
        "clf__C": [0.01, 0.1, 0.3, 1, 3, 10],
        "clf__class_weight": [None, 'balanced'],
    }
    # Classifier keys first, then the shared vectorizer keys.
    search_space = dict(clf_space, **params_count_word)

    count_logreg = Pipeline(steps=[
        ('features', CountVectorizer()),
        ('clf', LogisticRegression()),
    ])

    search = RandomizedSearchCV(
        count_logreg,
        param_distributions=search_space,
        scoring='neg_log_loss',
        n_iter=20,
        cv=3,
        n_jobs=4,
    )
    search.fit(train.text, train.author)
    report(search.cv_results_)

# Call left disabled; uncomment to run the search.
# random_search()

In [8]:
from sklearn.naive_bayes import MultinomialNB

def random_search():
    """Randomized search for a TF-IDF + multinomial naive Bayes pipeline.

    Samples 20 candidates from the smoothing values below combined with
    ``params_count_word``, scores each with 3-fold negative log loss on
    ``train.text`` / ``train.author`` and prints the top results via ``report``.
    """
    clf_space = {
        "clf__alpha": [0.01, 0.1, 0.5, 1, 2],
    }
    # Classifier keys first, then the shared vectorizer keys.
    search_space = dict(clf_space, **params_count_word)

    tfidf_nb = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])

    search = RandomizedSearchCV(
        tfidf_nb,
        param_distributions=search_space,
        scoring='neg_log_loss',
        n_iter=20,
        cv=3,
        n_jobs=4,
    )
    search.fit(train.text, train.author)
    report(search.cv_results_)

# random_search()  # Previous best result: -0.469

In [8]:
# Work on a copy so that columns added while exploring do not modify `train`.
explore = train.copy()

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

stem = PorterStemmer()


def _stem_whitespace_tokens(sentence):
    """Stem each whitespace-separated token of ``sentence`` and re-join with single spaces."""
    return " ".join([stem.stem(token) for token in sentence.split()])


# New column holding the stemmed version of every text.
explore['stemmed'] = explore.text.apply(_stem_whitespace_tokens)

In [17]:
def random_search():
    """Randomized search for TF-IDF + multinomial naive Bayes on the stemmed text.

    Samples 20 candidates from the smoothing values below combined with
    ``params_count_word``, scores each with 3-fold negative log loss on
    ``explore.stemmed`` / ``train.author`` and prints the top results via
    ``report``.
    """
    clf_space = {
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3],
    }
    # Classifier keys first, then the shared vectorizer keys.
    search_space = dict(clf_space, **params_count_word)

    tfidf_nb = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])

    search = RandomizedSearchCV(
        tfidf_nb,
        param_distributions=search_space,
        scoring='neg_log_loss',
        n_iter=20,
        cv=3,
        n_jobs=4,
    )
    search.fit(explore.stemmed, train.author)
    report(search.cv_results_)

random_search()  # -0.423

Model with rank: 1
Mean validation score: -0.426 (std: 0.003)
Parameters: {'features__max_df': 1.0, 'features__stop_words': None, 'clf__alpha': 0.05, 'features__min_df': 2, 'features__lowercase': True, 'features__ngram_range': (1, 3), 'features__analyzer': 'word'}

Model with rank: 2
Mean validation score: -0.435 (std: 0.003)
Parameters: {'features__max_df': 1.0, 'features__stop_words': None, 'clf__alpha': 0.1, 'features__min_df': 2, 'features__lowercase': False, 'features__ngram_range': (1, 2), 'features__analyzer': 'word'}

Model with rank: 3
Mean validation score: -0.450 (std: 0.004)
Parameters: {'features__max_df': 0.7, 'features__stop_words': None, 'clf__alpha': 0.1, 'features__min_df': 3, 'features__lowercase': False, 'features__ngram_range': (1, 2), 'features__analyzer': 'word'}

Model with rank: 4
Mean validation score: -0.450 (std: 0.003)
Parameters: {'features__max_df': 0.9, 'features__stop_words': None, 'clf__alpha': 0.01, 'features__min_df': 3, 'features__lowercase': True, 

In [25]:
# TF-IDF over case-sensitive word uni- and bigrams with multinomial naive Bayes (alpha=0.01).
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.8,
        lowercase=False,
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.83195466  0.83466135  0.83187739]
[-0.42530307 -0.418245   -0.42500535]


Да пробваме с най-добрите параметри, които сегашното пускане на Random Search намери:

In [26]:
# TF-IDF over case-sensitive word 1- to 3-grams with multinomial naive Bayes (alpha=0.1).
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.5,
        lowercase=False,
    )),
    ('clf', MultinomialNB(alpha=0.1)),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.83838848  0.84171008  0.84337165]
[-0.42373207 -0.42124757 -0.41789195]


In [5]:
import time
from sklearn.model_selection import GridSearchCV
def grid_search(text, params):
    """Exhaustively search ``params`` for a TF-IDF + multinomial NB pipeline.

    Parameters
    ----------
    text : sequence of str
        Documents to fit on; labels are always taken from ``train.author``.
    params : dict
        Parameter grid keyed by '<step>__<parameter>'.

    Each candidate is scored with 3-fold negative log loss. The top candidates
    are printed via ``report`` and the full ``cv_results_`` table is written
    as CSV to a file named 'grid_search_results_<timestamp>'.
    """
    tfidf_nb = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])
    searcher = GridSearchCV(
        tfidf_nb,
        param_grid=params,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=6,
    )

    searcher.fit(text, train.author)
    report(searcher.cv_results_)

    # Persist every candidate so that nothing is lost if a later search is interrupted.
    table = pd.DataFrame(searcher.cv_results_)
    out_name = 'grid_search_results_' + str(time.time())
    print('Saving grid search results to', out_name)
    table.to_csv(out_name)

In [73]:
start_time = time.time()

# Unigram-only grid; the wider n-gram ranges are left out of the list.
params = dict(
    features__ngram_range=[(1, 1)],  # , (1,2), (1,3)
    features__analyzer=['word'],
    features__max_df=[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    features__min_df=[1, 2, 3, 5, 10],
    features__lowercase=[False, True],
    features__stop_words=[None],
    features__norm=['l1', 'l2', None],
    clf__alpha=[0.001, 0.005, 0.01, 0.05, 0.1, 0.3],
)

# Search call left disabled; only the timing lines below run.
# grid_search(explore.stemmed, params)

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.00010037422180175781 seconds ---


Much hope, no results... За съжаление, след много мъка не успях да подкарам Grid Search да се изпълни с толкова много параметри, без да ми свърши RAM-та. Ще пробвам отново Randomized Search, просто с повече итерации.

In [80]:
import time
from sklearn.model_selection import GridSearchCV
def random_search_with_save(text, params, n_iter):
    """Randomized search for a TF-IDF + multinomial NB pipeline, saved to disk.

    Parameters
    ----------
    text : sequence of str
        Documents to fit on; labels are always taken from ``train.author``.
    params : dict
        Parameter distributions keyed by '<step>__<parameter>'.
    n_iter : int
        Number of candidates to sample.

    Each candidate is scored with 3-fold negative log loss. The top candidates
    are printed via ``report`` and the full ``cv_results_`` table is written
    as CSV to a file named 'random_search_results_<timestamp>'.
    """
    tfidf_nb = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])
    searcher = RandomizedSearchCV(
        tfidf_nb,
        param_distributions=params,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=6,
        n_iter=n_iter,
    )

    searcher.fit(text, train.author)
    report(searcher.cv_results_)

    table = pd.DataFrame(searcher.cv_results_)
    out_name = 'random_search_results_' + str(time.time())
    print('Saving random search results to', out_name)
    table.to_csv(out_name)

In [88]:
start_time = time.time()

# Word n-gram search space from unigrams only up to 1- to 5-grams.
params = dict(
    features__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    features__analyzer=['word'],
    features__max_df=[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    features__min_df=[1, 2, 3, 5, 10],
    features__lowercase=[False, True],
    features__stop_words=[None],
    features__norm=['l1', 'l2', None],
    clf__alpha=[0.001, 0.005, 0.01, 0.05, 0.1, 0.3],
)

# 50 separate rounds of 20 sampled candidates on the stemmed text; every round
# writes its own results file inside random_search_with_save.
for iteration in range(50):
    print('\n------------- ITERATION', iteration, '-------------')
    random_search_with_save(explore.stemmed, params, n_iter=20)

print("--- %s seconds ---" % (time.time() - start_time))


------------- ITERATION 0 -------------
Model with rank: 1
Mean validation score: -0.405 (std: 0.002)
Parameters: {'features__max_df': 0.9, 'clf__alpha': 0.05, 'features__min_df': 1, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: -0.477 (std: 0.002)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.005, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 3)}

Model with rank: 3
Mean validation score: -0.486 (std: 0.002)
Parameters: {'features__max_df': 0.7, 'clf__alpha': 0.005, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 5)}

Model with rank: 4
Mean validation score: -0.527 (std: 0.001)
Parameters: {'features__m

Model with rank: 1
Mean validation score: -0.438 (std: 0.003)
Parameters: {'features__max_df': 0.5, 'clf__alpha': 0.1, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 5)}

Model with rank: 2
Mean validation score: -0.452 (std: 0.002)
Parameters: {'features__max_df': 1.0, 'clf__alpha': 0.01, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 3)}

Model with rank: 3
Mean validation score: -0.453 (std: 0.004)
Parameters: {'features__max_df': 0.6, 'clf__alpha': 0.1, 'features__min_df': 3, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 5)}

Model with rank: 4
Mean validation score: -0.467 (std: 0.008)
Parameters: {'features__max_df': 0.7, 'clf__alpha': 0.005, 'features__

Model with rank: 1
Mean validation score: -0.439 (std: 0.004)
Parameters: {'features__max_df': 0.6, 'clf__alpha': 0.005, 'features__min_df': 1, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: -0.480 (std: 0.005)
Parameters: {'features__max_df': 0.7, 'clf__alpha': 0.01, 'features__min_df': 5, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': False, 'features__analyzer': 'word', 'features__ngram_range': (1, 5)}

Model with rank: 3
Mean validation score: -0.517 (std: 0.003)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.3, 'features__min_df': 5, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': False, 'features__analyzer': 'word', 'features__ngram_range': (1, 4)}

Model with rank: 4
Mean validation score: -0.520 (std: 0.006)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.001, 'featur

Model with rank: 1
Mean validation score: -0.428 (std: 0.005)
Parameters: {'features__max_df': 0.9, 'clf__alpha': 0.01, 'features__min_df': 1, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 3)}

Model with rank: 2
Mean validation score: -0.452 (std: 0.002)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.01, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 3)}

Model with rank: 3
Mean validation score: -0.453 (std: 0.005)
Parameters: {'features__max_df': 0.6, 'clf__alpha': 0.1, 'features__min_df': 3, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': False, 'features__analyzer': 'word', 'features__ngram_range': (1, 4)}

Model with rank: 4
Mean validation score: -0.477 (std: 0.003)
Parameters: {'features__max_df': 0.7, 'clf__alpha': 0.1, 'features__

Model with rank: 1
Mean validation score: -0.422 (std: 0.003)
Parameters: {'features__max_df': 0.9, 'clf__alpha': 0.05, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': False, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: -0.438 (std: 0.004)
Parameters: {'features__max_df': 0.9, 'clf__alpha': 0.005, 'features__min_df': 1, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 3
Mean validation score: -0.450 (std: 0.004)
Parameters: {'features__max_df': 0.6, 'clf__alpha': 0.1, 'features__min_df': 3, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': False, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 4
Mean validation score: -0.468 (std: 0.003)
Parameters: {'features__max_df': 0.5, 'clf__alpha': 0.05, 'feature

KeyboardInterrupt: 

In [90]:
# Reload one saved random-search results file and print its top-ranked rows.
saved_results = pd.read_csv('random_search_results_1512419845.3117762')
report(saved_results)

Model with rank: 1
Mean validation score: -0.405 (std: 0.002)
Parameters: {'features__max_df': 0.9, 'clf__alpha': 0.05, 'features__min_df': 1, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: -0.477 (std: 0.002)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.005, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 3)}

Model with rank: 3
Mean validation score: -0.486 (std: 0.002)
Parameters: {'features__max_df': 0.7, 'clf__alpha': 0.005, 'features__min_df': 2, 'features__stop_words': None, 'features__norm': 'l2', 'features__lowercase': True, 'features__analyzer': 'word', 'features__ngram_range': (1, 5)}

Model with rank: 4
Mean validation score: -0.527 (std: 0.001)
Parameters: {'features__max_df': 0.8, 'clf__alpha': 0.005, 'featur

Да пробваме с новонамерените най-добри параметри:

In [89]:
# TF-IDF (l2 norm) over lower-cased word uni- and bigrams with multinomial naive Bayes (alpha=0.05).
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.9,
        lowercase=True,
        norm='l2',
    )),
    ('clf', MultinomialNB(alpha=0.05)),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.85217525  0.85228317  0.84934866]
[-0.3919979  -0.39109396 -0.39412221]


Приятно подобрение. Имам нова идея за Grid Search, нека пробваме и нея и да събмитнем.

In [10]:
import time
from sklearn.model_selection import GridSearchCV
def grid_search(text, params):
    """Exhaustively search ``params`` for a TF-IDF + multinomial NB pipeline.

    Parameters
    ----------
    text : sequence of str
        Documents to fit on; labels are always taken from ``train.author``.
    params : dict
        Parameter grid keyed by '<step>__<parameter>'.

    Returns
    -------
    pandas.DataFrame
        The search's ``cv_results_`` as a frame. The same table is also written
        as CSV to a file named 'grid_search_results_<timestamp>'.

    Each candidate is scored with 3-fold negative log loss. Nothing is
    reported per search; the caller is expected to summarise the returned frame.
    """
    tfidf_nb = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])
    searcher = GridSearchCV(
        tfidf_nb,
        param_grid=params,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=6,
    )
    searcher.fit(text, train.author)

    table = pd.DataFrame(searcher.cv_results_)
    out_name = 'grid_search_results_' + str(time.time())
    print('Saving grid search results to', out_name)
    table.to_csv(out_name)
    return table

In [11]:
import numpy as np

def report_best_of_all(results, n_top=5):
    """Print the ``n_top`` best rows of a search-results frame.

    Rows are ordered by 'mean_test_score' (highest first) and numbered by
    their position in that ordering.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain 'mean_test_score', 'std_test_score' and 'params' columns.
    n_top : int, default 5
        Number of best rows to print. Previously this argument was ignored and
        five rows were always shown.
    """
    best = results.sort_values(by='mean_test_score', ascending=False).head(n_top)
    for position, (_, candidate) in enumerate(best.iterrows(), start=1):
        print("Model with rank: {0}".format(position))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              candidate['mean_test_score'],
              candidate['std_test_score']))
        print("Parameters: {0}".format(candidate['params']))
        print("")

In [135]:
# with stemmed text
def grid_search_ngram_range(ngram_range):
    """Grid-search the stemmed text for one fixed n-gram range.

    Runs one ``grid_search`` call per (max_df, min_df) pair on
    ``explore.stemmed``, printing the pair and the elapsed time after each
    call. The per-pair results are then concatenated and summarised with
    ``report_best_of_all``.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        Value used for the vectorizer's ``ngram_range`` in every search.
    """
    started = time.time()

    # Options searched exhaustively inside every single grid_search call.
    shared_grid = {
        "features__analyzer": ['word'],
        "features__lowercase": [False, True],
        "features__stop_words": [None],
        "features__norm": ['l1', 'l2', None],
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1, 2],
    }

    collected = []
    for max_df in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]:
        for min_df in [1, 2, 3, 5, 10]:
            print(ngram_range, max_df, min_df)
            # One value per key here, so each call covers one (max_df, min_df) slice.
            grid = dict(
                shared_grid,
                features__ngram_range=[ngram_range],
                features__max_df=[max_df],
                features__min_df=[min_df],
            )
            collected.append(grid_search(explore.stemmed, grid))
            print("--- %s seconds ---" % (time.time() - started))

    report_best_of_all(pd.concat(collected, ignore_index=True))

    print("--- %s seconds ---" % (time.time() - started))

grid_search_ngram_range((1,1))

(1, 1) 1.0 1
Saving grid search results to grid_search_results_1512426187.9401221
--- 36.0008704662323 seconds ---
(1, 1) 1.0 2
Saving grid search results to grid_search_results_1512426224.1731737
--- 72.23167753219604 seconds ---
(1, 1) 1.0 3
Saving grid search results to grid_search_results_1512426260.9672508
--- 109.02562808990479 seconds ---
(1, 1) 1.0 5
Saving grid search results to grid_search_results_1512426298.4766703
--- 146.5349199771881 seconds ---
(1, 1) 1.0 10
Saving grid search results to grid_search_results_1512426334.6683414
--- 182.7278757095337 seconds ---
(1, 1) 0.9 1
Saving grid search results to grid_search_results_1512426372.020266
--- 220.07926058769226 seconds ---
(1, 1) 0.9 2
Saving grid search results to grid_search_results_1512426412.8781815
--- 260.9392509460449 seconds ---
(1, 1) 0.9 3
Saving grid search results to grid_search_results_1512426455.9829326
--- 304.04143500328064 seconds ---
(1, 1) 0.9 5
Saving grid search results to grid_search_results_1512426

In [136]:
# Same stemmed-text grid search, now with n-gram range (1, 2).
grid_search_ngram_range((1,2))

(1, 2) 1.0 1
Saving grid search results to grid_search_results_1512427479.4382575
--- 102.81932425498962 seconds ---
(1, 2) 1.0 2
Saving grid search results to grid_search_results_1512427573.6548686
--- 197.033766746521 seconds ---
(1, 2) 1.0 3
Saving grid search results to grid_search_results_1512427662.6019924
--- 285.9807765483856 seconds ---
(1, 2) 1.0 5
Saving grid search results to grid_search_results_1512427750.6143012
--- 373.99206256866455 seconds ---
(1, 2) 1.0 10
Saving grid search results to grid_search_results_1512427837.7832794
--- 461.1602518558502 seconds ---
(1, 2) 0.9 1
Saving grid search results to grid_search_results_1512427936.3800635
--- 559.7603869438171 seconds ---
(1, 2) 0.9 2
Saving grid search results to grid_search_results_1512428028.11338
--- 651.4921116828918 seconds ---
(1, 2) 0.9 3
Saving grid search results to grid_search_results_1512428122.067634
--- 745.4462003707886 seconds ---
(1, 2) 0.9 5
Saving grid search results to grid_search_results_1512428218

In [137]:
# Same stemmed-text grid search, now with n-gram range (1, 3).
grid_search_ngram_range((1,3))

(1, 3) 1.0 1
Saving grid search results to grid_search_results_1512430400.1583655
--- 170.4003357887268 seconds ---
(1, 3) 1.0 2
Saving grid search results to grid_search_results_1512430551.652627
--- 321.8888840675354 seconds ---
(1, 3) 1.0 3
Saving grid search results to grid_search_results_1512430699.217702
--- 469.44824028015137 seconds ---
(1, 3) 1.0 5
Saving grid search results to grid_search_results_1512430843.3834398
--- 613.6137456893921 seconds ---
(1, 3) 1.0 10
Saving grid search results to grid_search_results_1512430991.1406446
--- 761.3688488006592 seconds ---
(1, 3) 0.9 1
Saving grid search results to grid_search_results_1512431162.1948178
--- 932.4381420612335 seconds ---
(1, 3) 0.9 2
Saving grid search results to grid_search_results_1512431315.9268553
--- 1086.1584510803223 seconds ---
(1, 3) 0.9 3
Saving grid search results to grid_search_results_1512431465.4162474
--- 1235.6493589878082 seconds ---
(1, 3) 0.9 5
Saving grid search results to grid_search_results_1512431

In [None]:
# Same stemmed-text grid search, now with n-gram range (1, 4).
grid_search_ngram_range((1,4))

(1, 4) 1.0 1
Saving grid search results to grid_search_results_1512435135.7461748
--- 256.1924467086792 seconds ---
(1, 4) 1.0 2
Saving grid search results to grid_search_results_1512435363.124219
--- 483.54827761650085 seconds ---
(1, 4) 1.0 3
Saving grid search results to grid_search_results_1512435591.3487787
--- 711.7688000202179 seconds ---
(1, 4) 1.0 5
Saving grid search results to grid_search_results_1512435812.0980244
--- 932.5234990119934 seconds ---
(1, 4) 1.0 10
Saving grid search results to grid_search_results_1512436035.761526
--- 1156.1834888458252 seconds ---
(1, 4) 0.9 1
Saving grid search results to grid_search_results_1512436298.6903434
--- 1419.133519411087 seconds ---
(1, 4) 0.9 2
Saving grid search results to grid_search_results_1512436525.4579515
--- 1645.888031721115 seconds ---
(1, 4) 0.9 3
Saving grid search results to grid_search_results_1512436750.4521673
--- 1870.8701527118683 seconds ---
(1, 4) 0.9 5
Saving grid search results to grid_search_results_1512436

In [None]:
# Same stemmed-text grid search, now with n-gram range (1, 5).
grid_search_ngram_range((1,5))

(1, 5) 1.0 1


In [11]:
# TF-IDF (l2 norm) over case-sensitive word uni- and bigrams with multinomial
# naive Bayes (alpha=0.05), evaluated on the stemmed text.
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.5,
        lowercase=False,
        norm='l2',
    )),
    ('clf', MultinomialNB(alpha=0.05)),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=explore.stemmed, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=explore.stemmed, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.83992034  0.84600061  0.84229885]
[-0.40539568 -0.40318401 -0.40505574]


In [13]:
def grid_search_ngram_range_original_text(ngram_range):
    """Grid-search the original (unstemmed) text for one fixed n-gram range.

    Runs one ``grid_search`` call per (max_df, min_df) pair on ``train.text``,
    printing the pair and the elapsed time after each call. The per-pair
    results are then concatenated and summarised with ``report_best_of_all``.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        Value used for the vectorizer's ``ngram_range`` in every search.
    """
    started = time.time()

    # Options searched exhaustively inside every single grid_search call.
    shared_grid = {
        "features__analyzer": ['word'],
        "features__lowercase": [False, True],
        "features__stop_words": [None],
        "features__norm": ['l2'],
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 1],
    }

    collected = []
    for max_df in [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]:
        for min_df in [1, 2, 3]:
            print(ngram_range, max_df, min_df)
            # One value per key here, so each call covers one (max_df, min_df) slice.
            grid = dict(
                shared_grid,
                features__ngram_range=[ngram_range],
                features__max_df=[max_df],
                features__min_df=[min_df],
            )
            collected.append(grid_search(train.text, grid))
            print("--- %s seconds ---" % (time.time() - started))

    report_best_of_all(pd.concat(collected, ignore_index=True))

    print("--- %s seconds ---" % (time.time() - started))

grid_search_ngram_range_original_text((1,2))

(1, 2) 1.0 1
Saving grid search results to grid_search_results_1512480129.0763836
--- 63.37544131278992 seconds ---
(1, 2) 1.0 2
Saving grid search results to grid_search_results_1512480188.0675588
--- 122.40183138847351 seconds ---
(1, 2) 1.0 3
Saving grid search results to grid_search_results_1512480243.3354244
--- 177.6348433494568 seconds ---
(1, 2) 0.9 1
Saving grid search results to grid_search_results_1512480305.0062802
--- 239.3092577457428 seconds ---
(1, 2) 0.9 2
Saving grid search results to grid_search_results_1512480364.1854217
--- 298.52094650268555 seconds ---
(1, 2) 0.9 3
Saving grid search results to grid_search_results_1512480423.2814221
--- 357.57992911338806 seconds ---
(1, 2) 0.8 1
Saving grid search results to grid_search_results_1512480484.8736901
--- 419.17773509025574 seconds ---
(1, 2) 0.8 2
Saving grid search results to grid_search_results_1512480544.3826375
--- 478.6832025051117 seconds ---
(1, 2) 0.8 3
Saving grid search results to grid_search_results_15124

In [14]:
# TF-IDF (l2 norm) over case-sensitive word uni- and bigrams with multinomial
# naive Bayes (alpha=0.05), evaluated on the original text.
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.5,
        lowercase=False,
        norm='l2',
    )),
    ('clf', MultinomialNB(alpha=0.05)),
])

# First line: default estimator score per fold; second line: negative log loss per fold.
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, X=train.text, y=train.author, cv=3, n_jobs=3,
                      scoring='neg_log_loss'))

[ 0.85416667  0.85335581  0.8502682 ]
[-0.38995613 -0.38833216 -0.39106993]


Получихме същите параметри и дори по-добър резултат.

In [15]:
# Fit the pipeline on all training texts and their author labels.
pipeline = pipeline.fit(train.text, train.author)

In [16]:
# Per-class probabilities for every test text.
test_predictions = pipeline.predict_proba(test.text)

In [17]:
# Inspect the class labels of the fitted pipeline before naming the probability columns.
pipeline.classes_

array(['EAP', 'HPL', 'MWS'],
      dtype='<U3')

In [18]:
# Label the probability columns from the fitted pipeline itself instead of a
# hand-typed list: predict_proba orders its columns like pipeline.classes_, so
# the labels cannot silently drift out of sync with the probabilities.
submit_file = pd.DataFrame(test_predictions, columns=pipeline.classes_, index=test.index)
submit_file.head(10)

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.085998,0.009646,0.904357
id24541,0.946239,0.040561,0.013199
id00134,0.025347,0.963843,0.01081
id27757,0.693222,0.293976,0.012802
id04081,0.694155,0.202899,0.102946
id27337,0.972891,0.026107,0.001002
id24265,0.956902,0.027761,0.015337
id25917,0.016713,0.046692,0.936595
id04951,0.991779,0.00796,0.000261
id14549,0.647427,0.189868,0.162704


In [19]:
# Write the submission frame to predictions.csv.
submit_file.to_csv("predictions.csv")