In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [None]:
from bs4 import BeautifulSoup

In [None]:
df_sample = pd.read_csv('sampleSubmission.csv', sep='\t', encoding='utf8')
df_train = pd.read_csv('train.csv', sep='\t', encoding='utf8')
df_test = pd.read_csv('test.csv', sep='\t', encoding='utf8')
df_other = pd.read_csv('other.csv', sep='\t', encoding='utf8')

In [None]:
def _soup (s):
    return BeautifulSoup(s, "lxml").get_text()

In [None]:
df_test['description'] = df_test['description'].map(lambda s: BeautifulSoup(s, "lxml").get_text())
df_train['description'] = df_train['description'].map(lambda s: BeautifulSoup(s, "lxml").get_text())

In [None]:
#Сохраним обработанные файлы, чтобы нам не делать эту процедуру каждый раз
df_test.to_csv('test_cleaned.csv', sep=',', encoding='utf8',index = False)
df_train.to_csv('train_cleaned.csv', sep=',', encoding='utf8',index = False)

In [None]:
df_other['description'] = df_other['description'].map(lambda s: BeautifulSoup(s, "lxml").get_text())

In [None]:
df_other.to_csv('other_cleaned.csv', sep=',', encoding='utf8',index = False)

In [2]:
# эта часть загружает предобработанные файлы
df_sample = pd.read_csv('sampleSubmission.csv', sep=',', encoding='utf8')
df_train = pd.read_csv('train_cleaned.csv', sep=',', encoding='utf8')
df_test = pd.read_csv('test_cleaned.csv', sep=',', encoding='utf8')
df_other = pd.read_csv('other_cleaned.csv', sep=',', encoding='utf8')

In [None]:
# для экспериментов возьмем небольшие объемы, например, первые 500 резюме

In [None]:
#df_train_sample_1 = df_train_1.iloc[:500,:]
#df_train_sample_0 = df_train_0.iloc[:500,:]

In [3]:
import re
from collections import Counter
import pymorphy2
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
morph=pymorphy2.MorphAnalyzer() 
posConv={'ADJF':'_ADJ','NOUN':'_NOUN','VERB':'_VERB'}
meaningfullPoSes=['ADJF', 'NOUN', 'VERB']

def getArticleDictionary(text, needPos=None):
    words=[a[0] for a in re.findall("([А-ЯЁа-яё]+(-[А-ЯЁа-яё]+)*)", text)]
    reswords=[]

    for w in words:
        wordform=morph.parse(w)[0]
        if wordform.tag.POS in meaningfullPoSes:
            if needPos!=None:
                reswords.append(wordform.normal_form+posConv[wordform.tag.POS])
            else:
                reswords.append(wordform.normal_form)
            
    return Counter(reswords)

In [5]:
def getMeaningfullWords(text):
    words=[]
    tokens=re.findall('[А-Яа-яЁё]+\-[А-Яа-яЁё]+|[А-Яа-яЁё]+', text)
    for t in tokens:
        pv=morph.parse(t)
        for p in pv:
            if p.tag.POS in ['ADJF', 'NOUN', 'VERB']:
                words.append(p.normal_form)
                break
    return words

In [6]:
def cosineSimilarity(a, b):
    if len(a.keys())==0 or len(b.keys())==0:
        return 0
    sumab=sum([a[na]*b[na] for na in a.keys() if na in b.keys()])
    suma2=sum([a[na]*a[na] for na in a.keys()])
    sumb2=sum([b[nb]*b[nb] for nb in b.keys()])
    return sumab/math.sqrt(suma2*sumb2)

In [7]:
df_train['description'] = df_train['name'] + " " + df_train['description']
df_train_1 = df_train[df_train['target']==1]
df_train_0 = df_train[df_train['target']==0]

In [None]:
df_train.head(2)

In [None]:
#создадим 2 словаря - однин, который соответствует результату 1, другой - результату 0

In [8]:
train_text_1 = ' '.join([df_train_1['name'][i] for i in (df_train_1.index)])

In [9]:
train_text_0 = ' '.join([df_train_0['name'][i] for i in (df_train_0.index)])

In [13]:
traindict_1 = getArticleDictionary(train_text_1)

In [14]:
traindict_0 = getArticleDictionary(train_text_0)

In [15]:
df_train['cos_1'] = \
    df_train['name'].map(lambda s: cosineSimilarity(traindict_1, getArticleDictionary(s)))

In [16]:
df_train['cos_0'] = \
    df_train['name'].map(lambda s: cosineSimilarity(traindict_0, getArticleDictionary(s)))

In [17]:
df_test['cos_1'] = \
    df_test['name'].map(lambda s: cosineSimilarity(traindict_1, getArticleDictionary(s)))

In [18]:
df_test['cos_0'] = \
    df_test['name'].map(lambda s: cosineSimilarity(traindict_0, getArticleDictionary(s)))

In [19]:
#чтобы все не потерялось, запишем аккуратно все в файлы
df_train.to_csv('train_cosine.csv', sep=',', encoding='utf8',index = False)
df_test.to_csv('test_cosine.csv', sep=',', encoding='utf8',index = False)

In [20]:
df_train.head(10)

Unnamed: 0,id,name,description,target,cos_1,cos_0
0,0,Заведующий отделом/секцией в магазин YORK (Уру...,Заведующий отделом/секцией в магазин YORK (Уру...,1,0.129839,0.076643
1,1,Наладчик станков и манипуляторов с ПУ,Наладчик станков и манипуляторов с ПУ Обязанно...,0,0.000106,0.0171
2,2,Разработчик С++ (Криптограф),Разработчик С++ (Криптограф) Требования: Опыт...,0,2.9e-05,0.099462
3,3,Фрезеровщик,Фрезеровщик Условия: На работу вахтовым метод...,0,2e-05,0.019226
4,4,Мерчендайзер/продавец-консультант,Мерчендайзер/продавец-консультант Компания Пал...,1,0.233848,0.012823
5,5,Мастер по эксплуатации зданий,Мастер по эксплуатации зданий Обязанности: Об...,0,0.000897,0.02577
6,6,Торговый представитель,Торговый представитель НА СТАБИЛЬНОЕ И РАСПРОС...,1,0.298861,0.00711
7,7,Торговый представитель,Торговый представитель Обязанности: Функциона...,1,0.298861,0.00711
8,8,Менеджер по продажам,Менеджер по продажам Обязанности: Продажа сад...,1,0.81473,0.181911
9,9,Менеджер по работе с ключевыми клиентами (POSM),Менеджер по работе с ключевыми клиентами (POSM...,0,0.482146,0.131515


In [None]:
#### дальше посмотрим на модели ###

In [23]:
X = df_train.drop(['id','name', 'description','target'], axis = 1).values
y = df_train['target'].values

In [31]:
df_train.pivot_table(values = ['cos_1', 'cos_0'],index = 'target', aggfunc = 'mean')

Unnamed: 0_level_0,cos_0,cos_1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.1867,0.068212
1,0.101496,0.305163


In [33]:
# разделим наши данные на тест и тренировочную выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

In [34]:
X_train

array([[ 0.00931075,  0.07488956],
       [ 0.8147303 ,  0.18191138],
       [ 0.00151261,  0.        ],
       ..., 
       [ 0.24424295,  0.005955  ],
       [ 0.12869489,  0.35198012],
       [ 0.18930107,  0.00727499]])

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [37]:
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

In [38]:
y_pred_proba[:,1]

array([ 0.96999115,  0.05406682,  0.7312179 , ...,  0.33073279,
        0.33073279,  0.81829721])

In [39]:
roc_auc_score(y_test, y_pred_proba[:,1])

0.88162699050028026

In [40]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

In [41]:
roc_auc_score(y_test, y_pred_proba[:,1])

0.94625314493737567

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
classifier = RandomForestClassifier(n_estimators = 100)

In [62]:
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

In [63]:
roc_auc_score(y_test, y_pred_proba[:,1])

0.98194942837186761

In [66]:
X_pred = df_test.drop(['id','name', 'description'], axis = 1).values

In [67]:
y_hat_proba = classifier.predict_proba(X_pred)
y_hat = classifier.predict(X_pred)

In [68]:
df_test.columns

Index(['id', 'name', 'description', 'cos_1', 'cos_0'], dtype='object')

In [69]:
df_results = df_test.drop(['name', 'description', 'cos_1', 'cos_0'], axis = 1)

In [None]:
#df_tmp = df_log_pred.copy()

In [None]:
#def sorter(s, n):
#    if s>=n:
#        return 1
#    else:
#        return 0

In [None]:
#df_tmp['target'] = df_tmp['cosine_name'].map(lambda s: sorter(s, 0.2))

In [None]:
#df_tmp.drop(['name','cosine_name'], axis = 1).to_csv('submission1.csv', sep=',', encoding='utf8',index = False)

In [70]:
df_results = pd.concat([df_results,pd.DataFrame(y_hat ,columns = ['target'])],axis = 1, names = ['id','target'])

In [71]:
df_results.to_csv('submission_forest.csv', sep=',', encoding='utf8',index = False)

In [72]:
df_results.head()

Unnamed: 0,id,target
0,200000,1
1,200001,1
2,200002,1
3,200003,1
4,200004,0
