In [1]:
import pandas as pd
import numpy as np
import pymorphy2
import nltk
import stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, StratifiedKFold, ParameterGrid
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from datetime import date
import fastnumbers
import re
from fastnumbers import isfloat, isint

In [3]:
# Load the training set: only CSV columns 1..10 are read (column 0 is skipped)
# and the two timestamp columns are parsed as datetimes.
data = pd.read_csv(
    '../data/train_set.csv',
    usecols=range(1, 11),
    parse_dates=['timestamp', 'thread_timestamp'],
)

In [4]:
# Keep only rows from the 12 channels listed below (the original note calls
# them the "top 12") for which the `main_msg` flag is set.
top_channels = [
    'career', 'big_data', 'deep_learning', 'kaggle_crackers',
    'lang_python', 'lang_r', 'nlp', 'theory_and_practice',
    'welcome', 'bayesian', '_meetings', 'datasets',
]
data = data[data.channel.isin(top_channels) & data.main_msg]

# value_counts() sorts by frequency (descending), so the first 100 entries
# are the 100 authors with the most remaining messages.
author_counts = data.user_id.value_counts()
users_100 = list(author_counts[:100].index)
data = data[data["user_id"].isin(users_100)]

# Class label of a user = its 1-based position in the frequency ranking.
mappings = {user: rank for rank, user in enumerate(users_100, 1)}

In [5]:
# Split the messages chronologically into train / validation sets, map authors
# to integer class labels and clean the message text.

# Raw strings: '\S', '\w', '\.' and '\s' are invalid escapes in normal string
# literals and trigger warnings on recent Python versions.
# The first pattern matches '<...>' tokens (optionally followed by ':'),
# the literal '&gt;' entity, and any run of word chars / dots around an '@'.
_NOISE_RE = re.compile(r'(<\S+>:?)|(&gt;)|([\w\.]*@[\w\.]*)')
_SPACE_RE = re.compile(r'\s+')


def _clean_message(text):
    """Replace every noise match with a space, then collapse whitespace runs to one space."""
    return _SPACE_RE.sub(' ', _NOISE_RE.sub(' ', text))


def _is_uninformative(text):
    """Return True when `text` parses as a number or is shorter than 20 characters."""
    return isfloat(text) or isint(text) or len(text) < 20


def _prepare_split(frame):
    """Build the model-ready (user_id, text) frame for one split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Slice of `data`; must contain the `user_id` and `text` columns.

    Returns
    -------
    pandas.DataFrame
        Columns `user_id` (integer label looked up in `mappings`) and `text`
        (cleaned, original casing), sorted by label. Rows whose cleaned text
        is numeric or shorter than 20 characters are dropped; the index is
        not renumbered after that drop.
    """
    prepared = (
        frame[['user_id', 'text']]
        .assign(user_id=lambda df: df.user_id.map(mappings))
        .sort_values('user_id')
        .reset_index(drop=True)
    )
    # astype(str) turns missing texts into the string 'nan', which the
    # length filter below then removes.
    prepared['text'] = prepared['text'].astype(str).map(_clean_message)
    # astype(bool) keeps `~` valid even when the split is empty (an empty
    # .map() result does not have a boolean dtype).
    drop_mask = prepared['text'].map(_is_uninformative).astype(bool)
    return prepared[~drop_mask]


# Compare against a pd.Timestamp: recent pandas versions raise TypeError when
# a datetime64 column is compared with a plain datetime.date object.
date_before = pd.Timestamp(date(2017, 4, 1))
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

train_data = _prepare_split(train)
val_data = _prepare_split(val)

# Model inputs: lower-cased text and integer labels. Labels start at 1 and go
# up to len(users_100), so int8 (max 127) is sufficient for 100 users.
train_text = train_data['text'].astype(str).str.lower()
train_labels = np.asarray(train_data['user_id'], dtype='int8')

val_text = val_data['text'].astype(str).str.lower()
val_labels = np.asarray(val_data['user_id'], dtype='int8')

In [11]:
# Baseline author classifier: binary bag of uni- and bigrams (vocabulary capped
# at 100000 features) fed into one-vs-rest logistic regression.
# Validation accuracy noted by the author: 15.34% without tuning,
# 19.85% with the tuned vectorizer settings used here.
classifier = Pipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            ngram_range=(1, 2),
            max_features=100000,
            min_df=1,
            binary=True,
        ),
    ),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression())),
])

In [12]:
# Fit the vectorizer and the classifier on the training messages; the fitted
# pipeline is the cell's displayed value.
classifier.fit(X=train_text, y=train_labels)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))])

In [13]:
# Predict an author label for every validation message.
predicted = classifier.predict(X=val_text)

In [14]:
# Share of validation messages attributed to the correct author.
# Arguments follow the documented (y_true, y_pred) order; accuracy itself is
# symmetric, but the other metrics imported in this notebook are not.
accuracy_score(val_labels, predicted)

0.19856926078473877

In [6]:
# Create a WordNet lemmatizer instance.
# NOTE(review): `lmtzr` is not used in the cells visible here — confirm it is
# needed further down; the import would also sit better in the top import cell.
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

In [14]:
# Collect the set of distinct tokens that occur in the training messages.
vocabulary = set()
# Iterate over the values directly: the index was unused, and
# Series.iteritems() no longer exists in pandas 2.x.
for sentence in train_text:
    for word in nltk.word_tokenize(sentence):
        # Drop characters that cannot be encoded as UTF-8 (e.g. lone
        # surrogates). The 'ignore' handler must sit on the encode step; on
        # the decode step of freshly encoded bytes it can never take effect.
        token = word.encode('utf-8', 'ignore').decode('utf-8')
        if token:  # skip tokens that became empty after stripping
            vocabulary.add(token)

In [16]:
# Persist the vocabulary, one token per line.
# - explicit UTF-8 so non-ASCII tokens are written correctly regardless of the
#   platform's default encoding;
# - sorted output so the file is identical across runs (set iteration order
#   of strings is not stable between interpreter sessions).
with open('../data/users_classification.vocab', 'w', encoding='utf-8') as f:
    f.writelines(token + '\n' for token in sorted(vocabulary))