In [0]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

##Загружаем данные

In [42]:
train = pd.read_csv('./linear_train.txt', header=None, names=['word', 'target'])
test = pd.read_csv('./linear_test.txt', header=None, names=['word'])
sample = pd.read_csv('./linear_ans_example.txt')
all_data = pd.concat([train , test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [3]:
sample.head()

Unnamed: 0,Id,Answer
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


## Признаки

In [0]:
vowels = 'аоэиуыеёюя'
consonant = 'бвгджзйклмнпрстфхцчшщьъ'
alphabet = set(vowels) | set(consonant)


def vowels_count(word):
    cnt = 0
    for i in word.lower():
        if i in vowels:
          cnt += 1
    return cnt
  
  
def consonant_count(word):
    return len(word) - vowels_count(word)

  
def divide_vow_by_cons(x):
    return vowels_count(x) / (consonant_count(x) + 1e-3)

  
def all_symbols(words):
    res = set()
    for w in words:
        res.update(list(w.lower()))
    return res
  
  
bad_symbols = all_symbols(all_data.word.values) ^ alphabet
  
  
def bad_count(word):
    cnt = 0
    for i in word.lower():
        if i in bad_symbols:
            cnt += 1
    return cnt

In [0]:
func = [str.isupper, str.istitle, len, vowels_count, consonant_count, divide_vow_by_cons, bad_count]
columns = ['isupper', 'istitle', 'len', 'vowels', 'consonant', 'divide_vow_by_cons', 'bad_count']
for f, col in zip(func, columns):
    all_data[col] = all_data['word'].apply(f)
all_data['is_duplicate'] = all_data['word'].duplicated(keep=False)

In [45]:
all_data.head()

Unnamed: 0,target,word,isupper,istitle,len,vowels,consonant,divide_vow_by_cons,bad_count,is_duplicate
0,1.0,Аалтонен,False,True,8,4,4,0.99975,0,False
1,0.0,Аар,False,True,3,2,1,1.998002,0,False
2,0.0,Аарон,False,True,5,3,2,1.49925,0,True
3,0.0,ААРОН,True,False,5,3,2,1.49925,0,False
4,0.0,Аарона,False,True,6,4,2,1.999,0,True


In [46]:
def validate(x, y):
    model = LogisticRegression()
    score = cross_val_score(model, x, y, scoring='roc_auc', cv=10)
    print(score.mean(), score.std() , '\n')

new_train = all_data[all_data['target'].notnull()]
validate(new_train.drop(['target', 'word' ], axis=1), new_train['target'])



0.8632292658463273 0.020836241924790274 



## pymorphy

In [48]:
!pip install pymorphy2



In [0]:
from sklearn.preprocessing import LabelEncoder
import pymorphy2

In [0]:
morph = pymorphy2.MorphAnalyzer()

In [0]:
all_data['pm'] = all_data['word'].apply(lambda x: morph.tag(x)[0])
all_data['pm_animacy'] = all_data['pm'].apply(lambda x: x.animacy)
all_data['pm_POS'] = all_data['pm'].apply(lambda x: x.POS)
all_data['pm_case'] = all_data['pm'].apply(lambda x: x.case)
all_data['pm_number'] = all_data['pm'].apply(lambda x: x.number)
all_data['pm_gender'] = all_data['pm'].apply(lambda x: x.gender)

In [0]:
cat_features = ['pm', 'pm_animacy', 'pm_POS', 'pm_case', 'pm_number', 'pm_gender']

for col in cat_features:
    all_data[col] = LabelEncoder().fit_transform(list(all_data[col].fillna('nan')))

In [26]:
new_train = all_data[all_data['target'].notnull()]
validate(new_train.drop(['target', 'word' ], axis=1), new_train['target'])



0.8939123687913136 0.015942569532318044 



## TF-IDF

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [0]:
bigram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1,10), max_features = 100, encoding='utf8')
for_tf_idf = bigram_vectorizer.fit_transform(all_data.word.apply(str.lower).tolist())
tfidf_sparse = TfidfTransformer().fit_transform(for_tf_idf)

In [0]:
tfidf = pd.DataFrame(tfidf_sparse.toarray(), index=all_data.index, columns=['tf_idf_' + str(i) for i in range(tfidf_sparse.shape[1])])

In [0]:
tfidf = pd.concat([all_data, tfidf], axis=1)

In [0]:
def validate(x , y):
    model = XGBClassifier(max_depth = 10, n_estimators=670, learning_rate=0.09, colsample_bytree=0.9, colsample_bylevel=0.6)
    cv = StratifiedKFold(4, shuffle=True, random_state=99)
    score = cross_val_score(model, x, y, scoring='roc_auc', cv=cv)
    print(score.mean(), score.std(), '\n')

In [58]:
new_tfidf = tfidf[tfidf['target'].notnull()]
validate(new_tfidf.drop(['target', 'word' ], axis=1), new_train['target'])

0.9591002034967829 0.00127796518267774 



In [0]:
new_train = tfidf[tfidf['target'].notnull()]
new_test = tfidf[tfidf['target'].isnull()]

model = XGBClassifier(max_depth = 10, n_estimators=670, learning_rate=0.09, colsample_bytree=0.9, colsample_bylevel=0.6)
model.fit(new_train.drop(['target', 'word'] , axis=1) , new_train['target'])
sample['Answer'] = model.predict_proba(new_test.drop(['word', 'target'] , axis=1))[:,1]

sample.to_csv('./submit.csv', index=False)