### [Спорт 2018 весна] hw0
## Nikita Fomin

In [50]:
import re

import pandas as pd
import numpy as np
import scipy as sp

from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import log_loss, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import Normalizer, normalize, StandardScaler

from natasha import NamesExtractor
from pymorphy2 import MorphAnalyzer

In [51]:
# Let pandas print up to 250 rows and 250 columns before truncating output.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 250)

In [52]:
# Training table; the rest of the notebook reads its 'Word' and 'Label' columns.
train = pd.read_csv('train.csv')
# Only the 'Word' column of the test file is kept, as a Series.
real_test = pd.read_csv('test.csv').loc[:, 'Word']

In [53]:
# Remove duplicates that carry different labels (compared case-insensitively).
# A row is kept when its lower-cased word is unique, or when its
# (lower-cased word, label) pair occurs more than once.

word_lower = train['Word'].apply(str.lower)

# True for every row whose lower-cased word occurs more than once.
repeated_word = word_lower.duplicated(keep=False)
# True for every row whose (lower-cased word, label) pair occurs more than once.
repeated_word_label = pd.concat([word_lower, train['Label']], axis=1).duplicated(keep=False)

# Standalone boolean masks are used instead of temporary helper columns, so
# nothing has to be dropped in place from the filtered frame afterwards
# (an in-place drop on a filtered slice can raise SettingWithCopyWarning).
# "not repeated, or (repeated and pair repeated)" reduces to the mask below.
train = train[~repeated_word | repeated_word_label]

In [54]:
# Сделать отложенную выборку

# np.random.seed(890)
# msk = np.random.rand(len(train)) < 0.333

# deferred_test = train[~msk]
# y_deferred_true = deferred_test['Label']
# X_deferred = deferred_test.drop('Label', axis=1)
# train = train[msk]

# print(len(deferred_test))
# del deferred_test
# y_deferred_true.value_counts()

In [55]:
# Balance the classes artificially: rows with Label == 1 (treated here as the
# minority class) are sampled with replacement until there are as many of
# them as rows with Label == 0.

majority, minority = (train[train['Label'] == label] for label in (0, 1))

minority_upsampled = resample(
    minority,
    n_samples=majority.shape[0],
    replace=True,
    random_state=348,
)
train = pd.concat((majority, minority_upsampled))

In [56]:
class WordFeatures(BaseEstimator, TransformerMixin):
    """Extract hand-crafted numeric features from individual words.

    ``transform`` takes a pandas Series of strings and returns a list with
    one ``{feature_name: number}`` dict per word, in input order. The
    pipeline in this file passes that list on to ``DictVectorizer``.
    """

    # Lower-case suffixes matched against the lower-cased word
    # (presumably typical surname endings -- confirm with the task statement).
    _TYPICAL_ENDINGS = (
        "ев", "ов", "ских", "ко", "заде", "ли", "лы", "оглу",
        "кызы", "ян", "янц", "уни", "ич", "ук", "ик", "ски",
        "ка", "ини", "юк", "ун", "ний", "ный", "чай", "ий", "а",
        "ишин", "ску", "ул", "ан", "цки", "ман", "ер", "те", "ис", "не", "пулос",
        "кос", "иди", "швили", "дзе", "ури",
        "иа", "уа", "ава", "си", "ни", "огло",
    )

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, words):
        """Compute one feature dict per word.

        Args:
            words: pandas Series of strings (``to_frame`` is called on it).

        Returns:
            A list of dicts, one per word, mapping feature name to an
            integer value.
        """
        # Both analyzers are created per call, so the transformer itself
        # holds no state.
        extractor = NamesExtractor()
        morph = MorphAnalyzer()

        def has_upper(text):
            """Return True if any character of ``text`` is upper-case."""
            return any(map(str.isupper, text))

        def count(pattern, text):
            """Return the number of non-overlapping matches of ``pattern``."""
            return len(re.findall(pattern, text))

        df = words.to_frame(name='word')
        word_lower = words.apply(str.lower)

        # Parse each word once and reuse the tag of the first parse returned
        # by pymorphy for both tag-based features.
        top_tags = [morph.parse(word)[0].tag for word in words]

        df['length'] = words.apply(len)
        df['is_letter'] = words.apply(lambda x: int(len(x) == 1))

        # NOTE: the key 'is_name_pymorpy' (sic) is a feature name and is kept as is.
        df['is_name_pymorpy'] = [int('Name' in tag) for tag in top_tags]
        # 1 when the natasha names extractor returns a truthy result for the word.
        df['is_name_natasha'] = words.apply(lambda x: int(bool(extractor(x))))
        df['is_noun'] = [int('NOUN' in tag) for tag in top_tags]

        df['typical_ending'] = word_lower.apply(
            lambda x: int(x.endswith(self._TYPICAL_ENDINGS)))

        # Counts of Russian vowels, consonants and soft/hard signs.
        df['cnt_vowels'] = word_lower.apply(lambda x: count('[аоэиуыеёюя]', x))
        df['cnt_consonants'] = word_lower.apply(
            lambda x: count('[бвгджзйклмнпрстфхцчшщ]', x))
        df['signs'] = word_lower.apply(lambda x: count('[ьъ]', x))

        df['digits'] = words.apply(lambda x: count('[0-9]', x))
        # Characters that are neither word characters nor whitespace.
        df['symbols'] = words.apply(lambda x: len(x) - len(re.sub(r'[^\w\s]', '', x)))

        df['dot'] = words.apply(lambda x: int('.' in x))
        df['apostrophe'] = words.apply(lambda x: int('`' in x or '\'' in x))

        df['is_upper'] = words.apply(lambda x: int(x.isupper()))
        df['is_lower'] = words.apply(lambda x: int(x.islower()))

        # x[:1] instead of x[0]: an empty string yields 0 rather than IndexError.
        df['is_first_upper'] = words.apply(
            lambda x: int(x[:1].isupper() and x[1:].islower()))
        df['is_first_lower'] = words.apply(
            lambda x: int(x[:1].islower() and has_upper(x[1:])))

        # Only the numeric feature columns go into the records.
        return df.drop(['word'], axis=1).to_dict('records')

In [57]:
# The FeatureUnion concatenates two equally weighted feature branches:
#   * 'n_grams'  - character n-grams of length 2..7 ('char_wb' analyzer),
#                  counted and then TF-IDF weighted;
#   * 'features' - WordFeatures dicts turned into a matrix by DictVectorizer.
# Every sample is then L2-normalized and classified with multinomial naive Bayes.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion(
        [
            ('n_grams', Pipeline(steps=[
                ('vect', CountVectorizer(ngram_range=(2, 7), analyzer='char_wb')),
                ('tfidf', TfidfTransformer(use_idf=True)),
            ])),
            ('features', Pipeline(steps=[
                ('features_selector', WordFeatures()),
                ('vect', DictVectorizer()),
            ])),
        ],
        transformer_weights=dict(n_grams=1.0, features=1.0),
    )),

    ('scaler', Normalizer(norm='l2')),
    ('clf', MultinomialNB()),
    # Other classifiers that were tried in place of the 'clf' step:
#     ('clf', RandomForestClassifier(n_estimators=50, random_state=197, n_jobs=-1, max_features='auto')),
#     ('clf', XGBClassifier(n_estimators=150, max_depth=15, seed=10, objective='binary:logistic', learning_rate=0.09 , colsample_bytree=0.9 , colsample_bylevel=0.6)),
#     ('clf', LogisticRegression(C=200, n_jobs=-1, solver='saga', penalty='l1', class_weight={0: 1, 1: 10})),
#     ('clf', SVC(C=1.0))
#     ('clf', SGDClassifier(loss='log', penalty='elasticnet', alpha=1e-05, class_weight={0: 1, 1: 10}, n_jobs=-1))
])

In [58]:
# Model input is the raw word column; the target is the label column.
X, y = train['Word'], train['Label']

In [59]:
# params = {
#     'clf__max_depth': [9, 11], 
#     'clf__min_child_weight': [45, 50, 55],
#     'clf__subsample': [0.97, 0.99, 1],
#     'clf__n_estimators': [20, 30],
#     'clf__colsample_bytree': [0.95, 0.97, 1]
# }

# params = {
#     'clf__loss': ['log'],
#     'clf__penalty': ['elasticnet'],
#     'clf__alpha': [1e-05],
#     'clf__l1_ratio': [0.15],
#     'clf__class_weight': [{0: 1, 1: 10}]
# }

# params = {
#     'clf__C': [1, 100, 1000],
#     'clf__penalty': ['l1', 'l2'],
#     'clf__solver': ['saga'],
#     'clf__class_weight': [{0: 1, 1: 10}],
#     'clf__n_jobs': [-1]
# }

# grid_search = GridSearchCV(pipeline, params, n_jobs=-1, verbose=1, scoring='roc_auc')
# grid_search.fit(X, y)
# print(grid_search.best_params_)
# print(grid_search.best_score_)

In [60]:
# cv = ShuffleSplit(n_splits=5, test_size=0.666, random_state=1159)
# scores = cross_val_score(pipeline, X, y, n_jobs=-1, scoring='roc_auc', cv=cv)
# print("ROC AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [61]:
# Fit the whole pipeline on the full (balanced) training data.
pipeline.fit(X=X, y=y)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('n_grams', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0,...malizer(copy=True, norm='l2')), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [62]:
# Take the second predict_proba column (with 0/1 labels, the probability of
# label 1) and number the rows 0..n-1 in an 'Id' column.
submission = (
    pd.Series(pipeline.predict_proba(real_test)[:, 1], name='Prediction')
    .rename_axis('Id')
    .reset_index()
)
submission.to_csv("submission.csv", index=False)

In [63]:
# roc_auc_score(y, pipeline.predict_proba(X)[:,1])

In [64]:
# X_deferred = X_deferred['Word']
# y_deferred = pipeline.predict_proba(X_deferred)[:,1]

# roc_auc_score(y_deferred_true, y_deferred)