In [0]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

In [0]:
# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

In [0]:
# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
print('Files in Drive:')
!ls drive/

In [0]:
# https://github.com/dmlc/xgboost
# This specific version is a work-around for a build issue in newer versions.
!pip install -q xgboost==0.4a30
import xgboost

In [113]:
!pip install pymorphy2
!pip install pymystem3
!pip install natasha

Collecting pymystem3
  Downloading pymystem3-0.1.9.tar.gz
Building wheels for collected packages: pymystem3
  Running setup.py bdist_wheel for pymystem3 ... [?25l- \ | / - done
[?25h  Stored in directory: /content/.cache/pip/wheels/3e/10/f5/e54d2cc166cfa6f36ab46004a11d9a346635af69e1ac05cba7
Successfully built pymystem3
Installing collected packages: pymystem3
Successfully installed pymystem3-0.1.9
Collecting natasha
  Downloading natasha-0.8.0-py2.py3-none-any.whl (795kB)
[K    100% |████████████████████████████████| 798kB 1.3MB/s 
[?25hCollecting yargy (from natasha)
  Downloading yargy-0.9.0-py2.py3-none-any.whl
Collecting intervaltree==2.1.0 (from yargy->natasha)
  Downloading intervaltree-2.1.0.tar.gz
Collecting jellyfish==0.5.6 (from yargy->natasha)
  Downloading jellyfish-0.5.6.tar.gz (132kB)
[K    100% |████████████████████████████████| 133kB 4.1MB/s 
[?25hCollecting backports.functools-lru-cache==1.3 (from yargy->natasha)
  Downloading backports.functools_lru_ca

In [0]:
import xgboost as xgb
from xgboost import plot_importance

import numpy as np
import pandas as pd
import scipy

from sklearn.preprocessing import StandardScaler, LabelEncoder

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from scipy.sparse import hstack
import pymorphy2

import pymystem3
from natasha import NamesExtractor

from sklearn.utils import resample

from google.colab import files

In [10]:
df = pd.read_csv('drive/DMIA/train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101408 entries, 0 to 101407
Data columns (total 2 columns):
Word     101408 non-null object
Label    101408 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [0]:
################

################

In [0]:
df_train, df_val = train_test_split(df, test_size=0.1) # UPSAMPLING

words_train = df_train['Word']
y_train = df_train['Label'].as_matrix()

words_val = df_val['Word']
y_val = df_val['Label'].as_matrix()

In [54]:
df_train['Label'].value_counts()

0    81676
1     9591
Name: Label, dtype: int64

In [97]:
# UPSAMPLE

df_majority = df_train[df_train.Label==0]
df_minority = df_train[df_train.Label==1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=81679,    # to match majority class
                                 random_state=128) # reproducible results
 
# Combine majority class with upsampled minority class
# df_train = pd.concat([df_majority, df_minority_upsampled])

print(df_train['Label'].value_counts())

0    81712
1    81679
Name: Label, dtype: int64


In [0]:
morph = pymorphy2.analyzer.MorphAnalyzer()

def tfidf(vectorizer, words, train=True):
  if train:
    vectorized = vectorizer.fit_transform(words)
  else:
    vectorized = vectorizer.transform(words)
  return vectorized

def morph_is_surname(word):
  return 'Surn' in ' '.join(' '.join(x.grammemes) for x in morph.tag(word))

def hand_feats(word, include_endings=True):
  mystem = pymystem3.Mystem()
  def name_from_pymystem(word):
      try:
          return 1 if 'имя' in mystem.analyze(word)[0]['analysis'][0]['gr'].split(',') else 0
      except:
          return 0
  def surn_from_pymystem(word):
      try:
          return 1 if 'фам' in mystem.analyze(word)[0]['analysis'][0]['gr'].split(',') else 0
      except:
          return 0
  func = NamesExtractor()
  def function_natasha(word):
      return 1 if func(word) else 0

  all_data = pd.DataFrame()
  
  all_data['natasha_person'] = word.apply(function_natasha)
  all_data['name_from_pymystem'] = word.apply(name_from_pymystem)
  all_data['surn_from_pymystem'] = word.apply(surn_from_pymystem)
  
  all_data['pymorphy'] = word.apply(lambda x: morph.tag(x)[0])
  all_data['pymorphy_animacy'] = all_data['pymorphy'].apply(lambda x: x.animacy)
  all_data['pymorphy_POS'] = all_data['pymorphy'].apply(lambda x: x.POS)
  all_data['pymorphy_case'] = all_data['pymorphy'].apply(lambda x: x.case)
  all_data['pymorphy_number'] = all_data['pymorphy'].apply(lambda x: x.number)
  all_data['pymorphy_gender'] = all_data['pymorphy'].apply(lambda x: x.gender)

  all_data.drop('pymorphy' , axis=1 , inplace=True)
  columns_to_one_hot = ['pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']
  for col in columns_to_one_hot:
      all_data[col] = LabelEncoder().fit_transform(list(all_data[col].fillna('nan')))
  
  morph_surname = word.map(morph_is_surname).as_matrix()
  first_capital = word.map(lambda x: len(x) > 1 and str.isupper(x[0])).as_matrix()
  first_capital_second_not = word.map(
      lambda x: len(x) > 2 and str.isupper(x[0]) and str.islower(x[1])).as_matrix()
  capslock = word.map(lambda x: str.isupper(x)).as_matrix()
  wordlen = word.map(lambda x: len(x)).as_matrix()
  isalpha = word.map(lambda x: str.isalpha(x)).as_matrix()
  upperinmiddle = word.map(lambda x: np.any([c.isupper() for c in x[1:]])).as_matrix()
  islower = word.map(lambda x: str.islower(x)).as_matrix().reshape(-1,1)
  
  if include_endings:
    ending_vectorizer = TfidfVectorizer(lowercase=False, analyzer='char', ngram_range=[1,7], strip_accents=None)
    ending = ending_vectorizer.fit_transform(word.map(lambda x: x[-4:]))
    return hstack([
      morph_surname.reshape(-1,1),
      first_capital.reshape(-1,1), 
      first_capital_second_not.reshape(-1,1), 
      capslock.reshape(-1,1), 
      wordlen.reshape(-1,1), 
      isalpha.reshape(-1,1), 
      ending,
      upperinmiddle.reshape(-1,1),
      all_data.as_matrix(),
        islower,
    ], format='csr')
  else:
    return np.hstack([
      morph_surname.reshape(-1,1),
      first_capital.reshape(-1,1), 
      first_capital_second_not.reshape(-1,1), 
      capslock.reshape(-1,1), 
      wordlen.reshape(-1,1), 
      isalpha.reshape(-1,1),
      upperinmiddle.reshape(-1,1),
      all_data.as_matrix(),
        islower,
    ])

In [0]:
##########

def prepare_X(words, y=None):
  features = hand_feats(words)
  X = scipy.sparse.csr_matrix(features, dtype='float64')
  return X

def report_train(label, model, X, y):
    preds = model.predict_proba(X)[:,1]
    print('{} ROC AUC:'.format(label), roc_auc_score(y, preds))

In [117]:
vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1,7), strip_accents=None, analyzer='char', max_features=80000)
X_train_lr = tfidf(vectorizer, words_train, train=True)
X_val_lr = tfidf(vectorizer, words_val, train=False)

X_train_lr = hstack([
  X_train_lr,
  hand_feats(words_train, include_endings=False)
], format='csr')

X_val_lr = hstack([
  X_val_lr,
  hand_feats(words_val, include_endings=False)
], format='csr')

scaler = StandardScaler(with_mean=False)
X_train_lr = scaler.fit_transform(X_train_lr)
X_val_lr = scaler.transform(X_val_lr)

logreg = LogisticRegression(penalty='l2',
                            dual=False,
                            tol=0.0001,
                            C=3e-5,
                            verbose=True,
                            class_weight='balanced'
                           )
logreg.fit(X_train_lr, y_train)

report_train('train', logreg, X_train_lr, y_train)
report_train('val', logreg, X_val_lr, y_val)

Installing mystem to /content/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz


[LibLinear]train ROC AUC: 0.9895271115026035
val ROC AUC: 0.9092130499889828


In [118]:
vectorizer1 = TfidfVectorizer(preprocessor=stemmer.stem ,lowercase=False, ngram_range=(1,7), strip_accents='unicode', analyzer='char', max_features=80000)
X_train_lr1 = tfidf(vectorizer1, words_train, train=True)
X_val_lr1 = tfidf(vectorizer1, words_val, train=False)

X_train_lr1 = hstack([
  X_train_lr1,
  hand_feats(words_train, include_endings=False)
], format='csr')

X_val_lr1 = hstack([
  X_val_lr1,
  hand_feats(words_val, include_endings=False)
], format='csr')

scaler1 = StandardScaler(with_mean=False)
X_train_lr1 = scaler1.fit_transform(X_train_lr1)
X_val_lr1 = scaler1.transform(X_val_lr1)

logreg1 = LogisticRegression(penalty='l2',
                            dual=False,
                            tol=0.0001,
                            C=3e-5,
                            verbose=True,
                            class_weight='balanced'
                           )
logreg1.fit(X_train_lr1, y_train)

report_train('train', logreg1, X_train_lr1, y_train)
report_train('val', logreg1, X_val_lr1, y_val)

[LibLinear]train ROC AUC: 0.9815800806591636
val ROC AUC: 0.9056907805118944


In [145]:

X_train = hand_feats(words_train, include_endings=False)
X_val = hand_feats(words_val, include_endings=False)

boost1 = xgb.XGBClassifier(silent=False, max_depth=10, learning_rate=0.09, n_estimators=670, colsample_bytree=0.9 , colsample_bylevel=0.6)
boost1.fit(X_train, y_train)

report_train('train', boost1, X_train, y_train)
report_train('val', boost1, X_val, y_val)

train ROC AUC: 0.943569920646248
val ROC AUC: 0.9243623691345133


In [120]:
mean_pred_train = (logreg.predict_proba(X_train_lr)[:,1] + 
                   boost1.predict_proba(X_train)[:,1] +
                   logreg1.predict_proba(X_train_lr1)[:,1]
                  )/3

mean_pred_val = (logreg.predict_proba(X_val_lr)[:,1] + 
                   boost1.predict_proba(X_val)[:,1] +
                   logreg1.predict_proba(X_val_lr1)[:,1]
                  )/3

print(roc_auc_score(y_train, mean_pred_train ))
print(roc_auc_score(y_val, mean_pred_val ))

0.9911318459801921
0.9119613801221624


In [0]:
df_val['Prediction'] = boost1.predict_proba(X_val)[:,1]

In [0]:
df_val.sample(5)

In [0]:
fig, ax = plt.subplots(1,1, figsize=[12,32])
plot_importance(boost1, ax=ax)

# Test

In [0]:
test = pd.read_csv('drive/DMIA/test.csv')
test_words = test['Word']

In [0]:

# X_test_lr = hstack([
#   tfidf(vectorizer, test_words, train=False),
#   hand_feats(test_words, include_endings=False)
# ], format='csr')
X_test = hand_feats(test_words)
# X_test_lr1 = hstack([
#   tfidf(vectorizer1, test_words, train=False),
#   hand_feats(test_words, include_endings=False)
# ], format='csr')

# X_test_lr = scaler.transform(X_test_lr)
# X_test_lr1 = scaler1.transform(X_test_lr1)

# mean_pred_test = (logreg.predict_proba(X_test_lr)[:,1] + 
#                    boost1.predict_proba(X_test)[:,1] +
#                    logreg1.predict_proba(X_test_lr1)[:,1]
#                   )/3

In [0]:
test['Prediction'] = boost1.predict_proba(X_test)[:,1] #mean_pred_test

In [152]:
test.sample(10)

Unnamed: 0,Word,Prediction
152500,сроки,0.024763
176745,хотений,0.002236
124611,Примитив,0.024965
126601,ПРОИЗВЕДЕНИЕ,0.00162
168433,Усмирение,0.0868
98602,ОБОРОТЕ,0.004672
177405,Хэддена,0.02566
180380,числительными,0.0005
183540,штрафах,0.002236
188330,яичко,0.024763


In [0]:
test.drop(['Word'], axis=1, inplace=True)
test.index.rename("Id", inplace=True)
test.to_csv('drive/DMIA/sub.csv')

# files.download('drive/DMIA/sub.csv')