In [1]:
def get_lemma_dict(path="data/lemma_dict.txt"):
    """Load a word-form -> lemma mapping from a whitespace-separated text file.

    A usable line holds at least two whitespace-separated fields: the first
    is the word form, the second its lemma; any further fields are ignored.
    Blank lines and lines with fewer than two fields are skipped. When a word
    form occurs more than once, the last occurrence wins.

    Args:
        path: Location of the dictionary file; it is read as UTF-8.

    Returns:
        dict mapping word form (str) to lemma (str).
    """
    lemma_dict = {}
    # Explicit encoding: the default codec of open() depends on the platform
    # locale and may fail to decode the file.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if len(fields) < 2:
                # blank or malformed line, nothing to map
                continue
            lemma_dict[fields[0]] = fields[1]
    return lemma_dict

def get_stop_words(path="data/stop_words_mini.txt"):
    """Load the stop-word list, one word per line.

    Lines whose first character is '*' are ignored. Every other line is
    stripped of surrounding whitespace and added to the result; lines that
    are empty after stripping are skipped.

    Args:
        path: Location of the stop-word file; it is read as UTF-8.

    Returns:
        set of stop words (str).
    """
    stop_words = set()
    # Explicit encoding: the default codec of open() depends on the platform
    # locale and may fail to decode the file.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('*'):
                continue
            word = line.strip()
            if word:
                stop_words.add(word)
    return stop_words

def get_ascii(word):
    """Filter a token down to lowercase Ukrainian letters, '-' and "'".

    Characters from the allowed set are copied through. A punctuation
    character becomes a single space, but only when something has already
    been emitted and the last emitted character is not a space. Everything
    else (Latin letters, digits, uppercase letters, whitespace, ...) is
    dropped. The apostrophe is listed in both sets; the allowed set is
    tested first, so it is always kept.

    Returns:
        The filtered string (possibly empty, possibly ending in a space).
    """
    allowed = "абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-\'"
    punctuation = "!?.;\"'/\\,;()"
    pieces = []
    for ch in word:
        if ch in allowed:
            pieces.append(ch)
            continue
        if ch not in punctuation:
            continue
        # collapse runs of punctuation and never start with a space
        if pieces and pieces[-1] != ' ':
            pieces.append(" ")
    return "".join(pieces)

def get_lemma_word(word, use_stop_words=True):
    """Lemmatize one whitespace-delimited token.

    The token is lower-cased, stripped and passed through ``get_ascii``,
    which may break it into several sub-tokens (punctuation becomes a space).
    Each sub-token is looked up in the module-level ``lemma_dict``.

    Args:
        word: Raw token.
        use_stop_words: When true, sub-tokens found in the module-level
            ``stop_words`` set are dropped.

    Returns:
        list of lemmas (str). When the token yields at most one sub-token and
        no lemma was produced, the sentinel ``[""]`` is returned; for a token
        that splits into several sub-tokens the list may be empty.
    """
    cleaned = get_ascii(word.lower().strip())
    tokens = cleaned.split()
    lemmas = []
    for token in tokens:
        # Look up the split token, not the cleaned string: the latter may
        # carry a trailing space left by punctuation ("класно." -> "класно ").
        if token not in lemma_dict:
            continue
        # The stop-word test applies to each sub-token individually.
        if use_stop_words and token in stop_words:
            continue
        lemmas.append(lemma_dict[token])
    if not lemmas and len(tokens) <= 1:
        return [""]
    return lemmas

def get_lemma_par(par):
    """Lemmatize a paragraph sentence by sentence.

    The paragraph is cut at every '.' and each piece is handed to
    ``get_lemma_sent``; pieces that yield no lemmas are left out.

    Returns:
        list of lemma lists, one per sentence that produced any lemma.
    """
    per_sentence = (get_lemma_sent(piece) for piece in par.split('.'))
    return [lemmas for lemmas in per_sentence if lemmas]

# Main
def get_lemma_sent(sent, use_stop_words=True):
    """Lemmatize a sentence into a flat list of lemmas.

    The sentence is split on whitespace and every token goes through
    ``get_lemma_word``; all lemmas it returns are concatenated in order.

    Args:
        sent: Sentence text.
        use_stop_words: Forwarded to ``get_lemma_word``.

    Returns:
        list of lemmas (str); empty when nothing was recognised.
    """
    lemmas = []
    for token in sent.split():
        found = get_lemma_word(token, use_stop_words=use_stop_words)
        # an empty list and the [""] sentinel both mean "nothing found"
        if not found or found == [""]:
            continue
        lemmas.extend(found)
    return lemmas

In [2]:
def get_antonyms_dict(path="data/antonyms.txt"):
    """Load a word -> antonym mapping from a comma-separated text file.

    A usable line has the form ``word,antonym``; both fields are stripped of
    surrounding whitespace and any further comma-separated fields are
    ignored. Lines without a comma, and lines where either field is empty,
    are skipped. When a word occurs more than once, the last occurrence wins.

    Args:
        path: Location of the antonyms file; it is read as UTF-8.

    Returns:
        dict mapping word (str) to its antonym (str).
    """
    out = {}
    # Explicit encoding: the default codec of open() depends on the platform
    # locale and may fail to decode the file.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            row = line.split(",")
            if len(row) < 2:
                # blank line or line without a separator
                continue
            key, value = row[0].strip(), row[1].strip()
            if not key or not value:
                continue
            out[key] = value
    return out
                
def process_antonyms(sent):
    """Resolve negation words in a list of lemmas.

    Walks the list left to right. A word found in the module-level
    ``opposite_dict`` is never emitted itself; what happens to the word
    after it depends on the module-level ``antonyms_dict``:

    * if the following word has an antonym, the antonym takes its place;
    * otherwise the following word is dropped as well.

    The last word is emitted as-is unless it was dropped by the rule above
    (so a negation word in final position is kept).

    Args:
        sent: list of lemmas. It is not modified.

    Returns:
        A new list of lemmas, or None when ``sent`` is empty or None.
    """
    if not sent:
        return None
    # Work on a copy so the caller's list is left untouched by the in-place
    # antonym substitution below.
    words = list(sent)
    out = []
    next_skip = False
    for i in range(len(words) - 1):
        if next_skip:
            next_skip = False
            continue
        if words[i] in opposite_dict:
            following = words[i + 1]
            if following in antonyms_dict:
                # substituted in place so the next iteration emits the antonym
                words[i + 1] = antonyms_dict[following]
            else:
                next_skip = True
        else:
            out.append(words[i])
    # next_skip still set here means the final word was the one to drop
    if not next_skip:
        out.append(words[-1])
    return out

In [3]:
# Negation words: process_antonyms swaps the word that follows one of these
# for its antonym, or drops it when no antonym is known.
# NOTE: despite the name this is a set, not a dict.
opposite_dict = {'ні', 'не', 'без'}
# Lookup tables read as module-level globals by get_lemma_word and
# process_antonyms; loaded from the default paths under data/.
lemma_dict = get_lemma_dict()
antonyms_dict = get_antonyms_dict()
stop_words = get_stop_words()

In [4]:
def process_sent(sent, use_antonyms=True, use_stop_words=True):
    """Run the full text-preprocessing pipeline on one sentence.

    Args:
        sent: Sentence text.
        use_antonyms: When true, the lemmas are post-processed by
            ``process_antonyms``.
        use_stop_words: Forwarded to ``get_lemma_sent``.

    Returns:
        list of lemmas, or None when nothing is left.
    """
    lemmas = get_lemma_sent(sent, use_stop_words=use_stop_words)
    result = process_antonyms(lemmas) if use_antonyms else lemmas
    # None and an empty list both collapse to None
    return result if result else None

### Test data processing

In [5]:
# Smoke test of the pipeline: the negated adjective should come out as its
# antonym (see the printed result below).
# Note that `sent` is rebound from the raw string to the list of lemmas.
sent = "Я була сьогодні не п'яна"
sent = process_sent(sent, use_antonyms=True, use_stop_words=True)
print(sent)

['сьогодні', 'тверезий']


### Model

In [6]:
# Exclusive upper bound on the number of space-separated tokens per text kept
# for training; also the default number of word slots in string_to_vects.
limit = 70

In [7]:
import warnings
warnings.filterwarnings('ignore')
import gensim
import sys
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import re
import pandas_profiling

In [8]:
# Path passed to load_word2vec_format in the next cell.
# NOTE(review): named "folder" but used as the vectors file itself.
data_folder = os.path.join('data', 'ubercorpus.lowercased.lemmatized.word2vec.300d')

In [9]:
%%time
# Load the pretrained word vectors from the text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(data_folder, binary=False)
# NOTE(review): init_sims is deprecated in gensim 4.x — confirm the installed
# version before re-running.
model.init_sims(replace=True)

CPU times: user 1min 4s, sys: 1.16 s, total: 1min 5s
Wall time: 1min 5s


In [10]:
# Load the labelled corpus; the "text" and "tone" columns are used below.
df = pd.read_csv("data/train.csv")
# Force every text to str so that .split works on each row
# (presumably guards against missing values — TODO confirm).
df["text"] = df["text"].astype(str)
# Token count per row, splitting on single spaces.
df["length"] = df["text"].apply(lambda x: len(x.split(" ")))
df.head()

Unnamed: 0,tone,text,length
0,0,препарат его первий не не не оно же состав фор...,69
1,-1,сайт служба контроль наркотик от заперти ораль...,20
2,-1,служба контроль наркотик об том сайт один орал...,14
3,-1,сайт служба контроль наркотик от заперти мл ам...,19
4,-1,розпорядження надходження інформація правоохор...,151


In [11]:
# Size of the corpus before the length filter is applied.
print(df.shape)

(69983, 3)


In [12]:
# Keep only texts with fewer than `limit` tokens.
# NOTE: this rebinds `df`; after changing `limit`, re-run from the load cell.
df = df[df["length"] < limit]

In [13]:
def word_to_vect(word):
    """Return the embedding of ``word`` from the module-level ``model``.

    Returns:
        The vector stored for ``word``, or None when the word is not in the
        model's vocabulary.
    """
    try:
        # `model` is a KeyedVectors instance and is indexed directly. Its
        # `.wv` self-alias was only a deprecated shim in gensim 3.x and is
        # unavailable in gensim 4, where it would raise AttributeError
        # instead of the KeyError handled here.
        return model[word]
    except KeyError:
        return None

In [14]:
def string_to_vects(data, max_len=limit, **kwargs):
    """Turn a sentence into one flat, fixed-length feature vector.

    The sentence is preprocessed with ``process_sent``; every resulting word
    that has an embedding (``word_to_vect``) contributes one vector. The
    vectors are laid out in order in ``max_len`` slots: surplus vectors are
    cut off and unused slots are filled with zeros, so the result always has
    ``max_len * dim`` entries, where ``dim`` is the embedding size.

    Args:
        data: Sentence text.
        max_len: Number of word slots in the output.
        **kwargs: Forwarded to ``process_sent``.

    Returns:
        1-D numpy array of length ``max_len * dim``, or None when the
        sentence yields no words or none of its words has an embedding.
    """
    words = process_sent(data, **kwargs)
    if not words:
        return None
    vects = []
    for word in words:
        vect = word_to_vect(word)
        if vect is not None:
            vects.append(vect)
    # An empty list (not None) is what signals "no known word".
    if not vects:
        return None
    # Take the dimension from the data instead of assuming 300.
    dim = len(vects[0])
    # Truncate so that every row has the same length; longer rows would make
    # the stacked feature matrix ragged.
    vects = vects[:max_len]
    out = np.zeros((max_len, dim))
    out[:len(vects)] = vects
    return out.reshape(-1)

In [15]:
def df_to_vects(data, max_len=limit, **kwargs):
    """Vectorise every row of a DataFrame into a feature matrix and labels.

    Each value of the "text" column is converted with ``string_to_vects``;
    rows for which it returns None are left out, together with their label.

    Args:
        data: DataFrame providing a "text" column and a "tone" column.
        max_len: Forwarded to ``string_to_vects``.
        **kwargs: Forwarded to ``string_to_vects``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays (features, "tone" labels), or None
        when no row could be vectorised.
    """
    X = []
    y = []
    # Iterate the frame that was passed in, not the notebook-level `df`.
    for text, tone in zip(data["text"], data["tone"]):
        row_vect = string_to_vects(text, max_len=max_len, **kwargs)
        if row_vect is not None:
            X.append(row_vect)
            y.append(tone)
    if len(X) < 1:
        return None
    return np.array(X), np.array(y)

In [16]:
def sentence_reader(regr, message):
    """Predict class probabilities for a single sentence.

    Args:
        regr: Fitted estimator exposing ``predict_proba``.
        message: Sentence text; vectorised with ``string_to_vects``.

    Returns:
        Whatever ``regr.predict_proba`` returns for a one-row batch.

    Raises:
        ValueError: If ``string_to_vects`` returns None for the message, i.e.
            there is nothing to feed to the model.
    """
    vect = string_to_vects(message)
    if vect is None:
        # Fail with a clear message rather than passing [None] to the model.
        raise ValueError("message contains no words known to the model: %r" % (message,))
    return regr.predict_proba([vect])


In [17]:
### Best C = 0.1

In [18]:
# Candidate regularisation strengths.
# NOTE(review): not referenced by the `params` grid below, which fixes C at 0.1.
C = [0.01, 0.05, 0.1, 0.5]

In [19]:
# Shuffle the rows. DataFrame.sample returns a new frame, so the result has to
# be assigned; the fixed seed keeps the order reproducible.
df = df.sample(frac=1, random_state=42)
X, y = df_to_vects(df)
# Hold out 10% of the rows for the final score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



In [20]:
# Free memory: the frame is dropped once X / y have been built from it.
# NOTE: after this cell, anything that reads `df` needs the load cell re-run.
del(df)


In [21]:
# Search space for GridSearchCV. Every option holds a single value, so exactly
# one configuration is fitted (per fold).
params = {
    "solver": ["saga"],
    "multi_class": ["multinomial"],
    # NOTE(review): a low iteration cap; check whether the solver converges.
    "max_iter": [20],
    "C": [0.1]
}

In [22]:
from sklearn.model_selection import GridSearchCV
# Base estimator; its hyper-parameters are supplied by the grid in `params`.
regr = LogisticRegression()

In [None]:
# 4-fold cross-validated search over `params` with 8 parallel workers.
best_regr = GridSearchCV(regr, params, cv=4, n_jobs=8, verbose=1)
best_regr.fit(X_train, y_train)
# NOTE(review): get_params() lists the GridSearchCV constructor settings; the
# selected hyper-parameters are in best_regr.best_params_.
best_regr.get_params()

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   4 out of   4 | elapsed: 13.5min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   4 out of   4 | elapsed: 13.5min finished


### Evaluation and manual check

In [None]:
# Fit here with the best C

In [None]:
# Score of the fitted search object on the held-out split.
print("Final score:", best_regr.score(X_test, y_test))

In [None]:
# Class probabilities for one hand-written sentence.
sentence_reader(best_regr, "Це було класно")

In [None]:
# Shell listing of the working directory; not used by the analysis.
!ls