In [None]:
import math
import re
import pandas as pd
import numpy as np
import pymorphy2
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import WhitespaceTokenizer
from string import punctuation
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import preprocessing


def _as_texts(column):
    """Return the column's values as a list, with non-string entries as None."""
    return [value if isinstance(value, str) else None for value in column]


def _per_text(texts, measure):
    """Apply ``measure`` to every string in ``texts``; ``None`` entries give 0.

    The result is a float64 array.
    NOTE(review): float is kept on the assumption that downstream code saw
    float feature columns before; switch to int if nothing depends on it.
    """
    return np.asarray(
        [measure(text) if text is not None else 0 for text in texts],
        dtype=float,
    )


def _count_capitals(text):
    """Count ASCII capital letters (A-Z) in ``text``."""
    # NOTE(review): this matches Latin capitals only. The rest of the file
    # works with Russian stop words, so Cyrillic capitals are presumably
    # common in the data and are NOT counted here -- confirm this is intended.
    return len(re.findall("[A-Z]", text))


def merge_com(data):
    """Derive per-review text features and fold the optional comments into 'comment'.

    Args:
        data: DataFrame with the columns 'comment', 'commentPositive' and
            'commentNegative'. Entries that are not strings (e.g. NaN) are
            treated as missing. Any index is accepted; rows are processed
            by position.

    Returns:
        A copy of ``data`` (the input is left unmodified) in which:
          * 'com_len', 'comPos_len', 'comNeg_len' hold the character length
            of each original field (0 when missing);
          * 'nCapital', 'nCapitalPos', 'nCapitalNeg' hold the number of
            A-Z characters in each original field (0 when missing);
          * 'hasNegComment' / 'hasPosComment' are 1 when the respective
            field is a string and 0 otherwise;
          * 'comment' is the original comment followed by the negative and
            then the positive comment, separated by single spaces, with
            missing parts skipped;
          * 'n_of_exc' is the number of '!' characters in the merged comment;
          * 'commentNegative' and 'commentPositive' are dropped.
    """
    data = data.copy()

    main = _as_texts(data['comment'])
    positive = _as_texts(data['commentPositive'])
    negative = _as_texts(data['commentNegative'])

    # Whole columns are assigned positionally (plain arrays/lists), so the
    # result does not depend on the frame having a 0..n-1 index.
    # Lengths and capital counts describe each field *before* merging.
    data['com_len'] = _per_text(main, len)
    data['comPos_len'] = _per_text(positive, len)
    data['comNeg_len'] = _per_text(negative, len)
    data['nCapital'] = _per_text(main, _count_capitals)
    data['nCapitalPos'] = _per_text(positive, _count_capitals)
    data['nCapitalNeg'] = _per_text(negative, _count_capitals)

    data['hasNegComment'] = _per_text(negative, lambda _text: 1)
    data['hasPosComment'] = _per_text(positive, lambda _text: 1)

    # Merge order is comment, negative, positive; missing parts are skipped.
    merged = [
        ' '.join(part for part in parts if part is not None)
        for parts in zip(main, negative, positive)
    ]
    data['comment'] = merged

    # Exclamation marks are counted on the merged text, i.e. across all fields.
    data['n_of_exc'] = _per_text(merged, lambda text: text.count('!'))

    return data.drop(['commentNegative', 'commentPositive'], axis=1)

# Morphological analyzer instance; parse_sentence uses it to lemmatize tokens.
morph = MorphAnalyzer()

# Stop-word list used by parse_sentence: NLTK's Russian stop words minus a
# set of words that are kept in the text (negations, quantifiers and
# evaluative words -- presumably because they carry sentiment), plus a few
# extra tokens to discard (they look like filler and price abbreviations --
# TODO confirm).
mystopwords = list(
    set(stopwords.words('russian')).difference(
        [
            'много', 'без', 'никогда', 'совсем', 'не', 'нет',
            'более', 'больше', 'ничего', 'но', 'хорошо', 'лучше',
        ]
    )
) + ['это', 'х', 'р', 'тыс', 'тыщ', 'руб']

# Characters deleted outright before tokenizing: ASCII punctuation, digits,
# en/em dashes and guillemets. Built once instead of on every call.
_DROPPED_CHARS = str.maketrans('', '', punctuation + '0123456789' + '–—' + '«»')


def parse_sentence(sent):
    """Normalize a review text into a space-separated string of lemmas.

    Steps, in order:
      1. '...', '-' and '/' are replaced by spaces;
      2. punctuation, digits, dashes and guillemets are removed;
      3. the text is lower-cased and split on whitespace;
      4. tokens found in ``mystopwords`` are dropped (the check is done on
         the surface form, before lemmatization);
      5. every remaining token is replaced by the normal form of its first
         pymorphy2 parse.

    Args:
        sent: the text to normalize (a ``str``).

    Returns:
        The lemmas joined by single spaces; '' when no token survives.
    """
    # These separators become spaces rather than being deleted, so the words
    # on either side of them remain separate tokens.
    spaced = sent.replace('...', ' ').replace('-', ' ').replace('/', ' ')
    cleaned = spaced.translate(_DROPPED_CHARS).lower()

    # Snapshot as a set: constant-time membership per token instead of a
    # linear scan of the list. Taken per call so later changes to
    # ``mystopwords`` are still honoured.
    stop_words = set(mystopwords)

    lemmas = [
        morph.parse(token)[0].normal_form
        for token in WhitespaceTokenizer().tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(lemmas)


# Fixed seed so the train/test split -- and therefore the printed score --
# is the same on every run.
RANDOM_STATE = 42

print('Preparing dataset\n')
df = pd.read_csv('X_train.csv').drop(['userName', 'property', 'date'], axis=1)
df = merge_com(df)
df['parsed'] = df['comment'].apply(parse_sentence)
df = df.drop(['comment'], axis=1)
# 'reting' is the label column's name as it comes from X_train.csv.
# The built-in round() rounds exact halves to the nearest even integer
# (2.5 -> 2, 3.5 -> 4). NOTE(review): confirm that is the intended bucketing.
df['reting'] = df['reting'].apply(round)

print('Splitting dataset into train and test\n')
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['reting'], axis=1),
    df['reting'],
    test_size=0.33,
    random_state=RANDOM_STATE,
)

# The vectorizer is fitted on the training texts only and then applied to
# both splits. Only the 'parsed' text column feeds the model; the numeric
# columns added by merge_com are not used below.
tfidf = TfidfVectorizer(min_df=5, ngram_range=(1, 3)).fit(X_train['parsed'])

tfidf_train = tfidf.transform(X_train['parsed'])
tfidf_test = tfidf.transform(X_test['parsed'])

print('Training model...\n')
clf = SVC(C=1.0, kernel='linear').fit(tfidf_train, np.array(y_train))
print('Score:')
# Accuracy of the classifier on the held-out split.
print(clf.score(tfidf_test, np.array(y_test)))

Preparing dataset

