In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import pymorphy2
import re
import numpy as np

In [None]:
train_data = pd.read_csv('data/train.csv', sep='\t', index_col='id')

## Описание полей
 * __id__ &mdash; внутренний идетификатор
 * __name__ &mdash; название вакансии
 * __description__ &mdash; текст вакансии
 * __target__ &mdash; класс заинтересованности

In [None]:
train_data.head()

In [None]:
posConv={'ADJF':'_ADJ','NOUN':'_NOUN','VERB':'_VERB', 'PRTF': '_PRT', 'GRND': '_GRND'}

def getVacancyVector(text, needPos=None):
    text = BeautifulSoup(text, 'lxml').get_text()
    
    words=[a[0] for a in re.findall("([А-ЯЁа-яё]+(-[А-ЯЁа-яё]+)*)", text)]
    reswords=[]

    for w in words:
        wordform=morph.parse(w)[0]
        if wordform.tag.POS in ['ADJF', 'NOUN', 'VERB', 'PRTF', 'GRND']:
            if needPos!=None:
                reswords.append(wordform.normal_form+posConv[wordform.tag.POS])
            else:
                reswords.append(wordform.normal_form)
            
    return reswords  


In [None]:
morph=pymorphy2.MorphAnalyzer() 

Для начала обработаем все записи и сохраним обработанную таблицу, что бы в будущем обращатся к размеченным данным.

In [None]:
%%time

train_data.description = train_data.apply(lambda row: getVacancyVector(row['description'], True), axis=1)
train_data.name = train_data.apply(lambda row: getVacancyVector(row['name'], True), axis=1)

## Начинаем непосредственно обучение

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

### CountVectorizer + TF_IDF + LogisticClassifier без лемматизации

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [5]:
train = pd.read_csv('data/train.csv', sep='\t', index_col='id')
test = pd.read_csv('data/test.csv', sep='\t', index_col='id')

In [6]:
test_array = np.array(test['name']+' '+test['description'])


In [44]:
train_array = np.array(train['name']+' '+train['description'])
train_array.shape

(200000,)

In [125]:
texts = []
for t in np.hstack((train_array, test_array)):    
    texts.append(
        BeautifulSoup(t, 'lxml').text
    )

In [126]:
y = train.iloc[:, -1].values

In [128]:
cv = CountVectorizer(max_df=0.8, min_df=3)

In [131]:
temp_matrix = cv.fit_transform(texts)

In [132]:
tfidf_transformer = TfidfTransformer(norm=None, smooth_idf=False)
matrix_tfidf = tfidf_transformer.fit_transform(temp_matrix)

In [133]:
classifier = LogisticRegression(random_state=123, n_jobs=2)

In [134]:
%%time 

classifier.fit(matrix_tfidf[:train_array.shape[0]], y)

  " = {}.".format(self.n_jobs))


Wall time: 2min 39s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [135]:
%%time 
predictions = classifier.predict(matrix_tfidf[train_array.shape[0]:])

Wall time: 577 ms


In [136]:
result1 = pd.DataFrame(predictions, columns=['target'], index=np.arange(200000, 370179))

In [140]:
result1.head(10)

Unnamed: 0,target
200000,1
200001,1
200002,1
200003,1
200004,0
200005,0
200006,1
200007,0
200008,0
200009,0


In [138]:
result1.to_csv('data/result1.csv', index_label='id')

In [None]:
from sklearn.cross_validation import cross_val_score
cross_val_score(
    classifier,
    matrix_tfidf[:train_array.shape[0]],
    y,
    scoring='roc_auc',
    cv=5
)

In [None]:
rfc = RandomForestClassifier()

In [None]:
all_texts = np.hstack((train_texts, test_texts))
                      
                      
model = Pipeline (
    [
        ('cv', CountVectorizer()),
        ('tfidf', TfidfTransformer(norm=None, smooth_idf=False)),
        ('classifier', LogisticRegression(random_state=123))
    ]
)


model.fit(X, y)

In [None]:
from gensim.models.word2vec import Word2Vec # Собственно модель.
from gensim.models.word2vec import LineSentence # Выравнивание текста по предложениям.
from gensim.models import KeyedVectors # Семантические вектора.