# Подготовка данных

In [25]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm
import random
import copy

In [26]:
# Shared tweet tokenizer used by parse_csv_twitter. Per the NLTK docs,
# strip_handles removes @username handles and reduce_len shortens long runs
# of a repeated character.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

In [27]:
# Directory prefix (trailing slash included) for the raw CSVs and every
# derived text file; paths are built by plain string concatenation.
datapath = "data_twitter/"


def parse_csv_twitter(filename, fileout):
    """Clean the tweets of a ';'-separated CSV and write them one per line.

    The tweet text is taken from column 3 of ``datapath + filename`` (the
    file is read without a header row). Each tweet is tokenized with the
    module-level ``tknzr``; tokens starting with ``http`` or ``#`` are
    dropped; the remaining tokens are joined with single spaces; the
    substring ``'RT :'`` and all double quotes are removed; and the result is
    lower-cased. Tweets left with no tokens are skipped.

    Args:
        filename: Name of the input CSV inside ``datapath``.
        fileout: Name of the output text file inside ``datapath``; it is
            overwritten if it already exists.
    """
    # Drop missing cells and coerce the rest to str so that a NaN or numeric
    # value never reaches the tokenizer, which expects text.
    raw_texts = pd.read_csv(datapath + filename, sep=";", header=None)[3].dropna().astype(str)

    cleaned = []
    for text in raw_texts:
        kept = [tok for tok in tknzr.tokenize(text) if not tok.startswith(('http', '#'))]
        if not kept:
            continue
        # NOTE(review): removing 'RT :' can leave a leading space (or an empty
        # string); downstream code filters out empty lines, so this is kept.
        cleaned.append(' '.join(kept).replace('RT :', '').replace('"', '').lower())

    # The with-block closes the file; no explicit close() is required.
    with open(datapath + fileout, 'w', encoding='UTF-8') as f:
        f.writelines(line + '\n' for line in cleaned)

In [28]:
# Clean data_twitter/positive.csv into data_twitter/pos.txt.
parse_csv_twitter("positive.csv", "pos.txt")

In [29]:
# Clean data_twitter/negative.csv into data_twitter/neg.txt.
parse_csv_twitter("negative.csv", "neg.txt")

In [30]:
def load_sentences(file_path):
    """Return the lines of a UTF-8 text file with their newlines removed.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order. Only
        newline characters are stripped; other whitespace is preserved.
    """
    collected = []
    with open(file_path, encoding='UTF-8') as handle:
        for raw_line in handle:
            collected.append(raw_line.strip('\n'))
    return collected

In [31]:
def save_sentences(list_sent, file_path):
    """Write each sentence on its own line to a UTF-8 text file.

    Args:
        list_sent: Iterable of strings to write, in order.
        file_path: Destination path; the file is created or overwritten.
    """
    # The with-block closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(file_path, 'w', encoding='UTF-8') as f:
        f.writelines(s + '\n' for s in list_sent)

In [32]:
def split_dataset(sentences, temp_name, tr, d, seed=None):
    """Shuffle sentences and save them as train/dev/test files.

    The shuffled data is cut into three consecutive parts and written with
    ``save_sentences`` to ``datapath + temp_name`` plus the suffixes
    ``.train``, ``.dev`` and ``.test``.

    Args:
        sentences: Sequence of sentences; the caller's object is not modified.
        temp_name: Base file name (inside ``datapath``) for the three outputs.
        tr: Fraction of the data that goes to the train part.
        d: Fraction of the data that goes to the dev part; everything left
            after train and dev goes to the test part.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible; when None
            (the default) the global ``random`` state is used as before.
    """
    # A shallow copy is enough: the items are only reordered and written out,
    # never mutated.
    shuffled = list(sentences)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    # Compute the cut points once instead of repeating them in every slice.
    total = len(shuffled)
    train_end = int(tr * total)
    dev_end = train_end + int(d * total)

    save_sentences(shuffled[:train_end], datapath+temp_name+'.train')
    save_sentences(shuffled[train_end:dev_end], datapath+temp_name+'.dev')
    save_sentences(shuffled[dev_end:], datapath+temp_name+'.test')
    

In [33]:
from nltk.tokenize import word_tokenize

In [34]:
# Read the cleaned positive tweets written by parse_csv_twitter.
pos_sent = load_sentences(datapath+"pos.txt")

In [36]:
# Discard lines that consist solely of '?', '.' or nothing at all.
pos_sent = [sentence for sentence in pos_sent if sentence not in ('?', '.', '')]

In [39]:
# Read the cleaned negative tweets and discard lines that consist solely of
# '?', '.' or nothing at all.
neg_sent = [
    sentence
    for sentence in load_sentences(datapath + "neg.txt")
    if sentence not in ('?', '.', '')
]

In [40]:
# 70% train, 20% dev, remainder test -> data_twitter/twitter_neg.{train,dev,test}
split_dataset(neg_sent, 'twitter_neg', 0.7, 0.2)

In [38]:
# 70% train, 20% dev, remainder test -> data_twitter/twitter_pos.{train,dev,test}
split_dataset(pos_sent, 'twitter_pos', 0.7, 0.2)

# Построение классификатора fasttext

In [14]:
import fasttext

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Data directory for the fasttext cells; same value as `datapath` defined in
# the data-preparation part, under a second name.
data_path = 'data_twitter/'

In [41]:
def load_sentences(filename):
    """Read a UTF-8 text file and return its lines without newlines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line, in file order. Only newline
        characters are stripped; other whitespace is preserved.
    """
    # The with-block closes the file on exit, so no explicit close() call
    # is needed afterwards.
    with open(filename, encoding='UTF-8') as f:
        return [line.strip('\n') for line in f]

In [42]:
# Reload the cleaned tweets straight from disk. NOTE(review): the '?' / '.' /
# empty-line filtering applied to pos_sent / neg_sent earlier is not applied
# to these lists.
pos_s = load_sentences('data_twitter/pos.txt')
neg_s = load_sentences('data_twitter/neg.txt')

In [43]:
# Build the labelled dataset: X holds every positive sentence followed by
# every negative one, and y holds the matching fasttext label for each entry.
X = list(pos_s)
X.extend(neg_s)
y = ['__label__pos'] * len(pos_s) + ['__label__neg'] * len(neg_s)

In [44]:
# Hold out 10% of the labelled sentences for testing; random_state is fixed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [45]:
def _write_fasttext_file(path, texts, labels):
    """Write one '<label> <text>' line per example to a UTF-8 file at path."""
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(path, 'w', encoding='UTF-8') as out:
        out.writelines(label + ' ' + text + '\n' for text, label in zip(texts, labels))


# Writing through a helper keeps the loop variables local, so the global
# label list `y` is no longer overwritten by the last label string and the
# earlier cells can be re-run safely.
_write_fasttext_file('data_twitter/fasttext_train_twitter.txt', X_train, y_train)
_write_fasttext_file('data_twitter/fasttext_test_twitter.txt', X_test, y_test)

In [47]:
# Hyperparameter grid explored by test_param: every combination of the four
# lists is trained, i.e. 4 * 3 * 2 * 4 = 96 models.
param_dict = {
    'lr' : [0.01, 0.05, 0.1, 0.5],
    'epoch' : [25, 50, 100],
    'wordNgrams': [2,3],
    'dim': [10, 50, 100, 256]
}

In [48]:
def test_param(train_file, test_file, param_dict, bucket=200000):
    """Grid-search fasttext supervised hyperparameters and return the best model.

    One model is trained for every combination of ``param_dict['lr']``,
    ``param_dict['epoch']``, ``param_dict['wordNgrams']`` and
    ``param_dict['dim']``. Each model is scored with ``model.test(test_file)``
    and its precision and recall are printed. The model with the highest
    precision is kept; on ties the earliest combination wins.

    NOTE(review): the winner is chosen on ``test_file`` itself, so its score
    on that same file is an optimistic estimate; consider selecting on a
    separate dev file.

    Args:
        train_file: Path of the fasttext-format training file.
        test_file: Path of the fasttext-format file used for scoring.
        param_dict: Mapping with the keys 'lr', 'epoch', 'wordNgrams' and
            'dim', each holding an iterable of candidate values.
        bucket: Value passed as ``bucket`` to every training run.

    Returns:
        The best-scoring trained model, or None when no combination reached a
        precision above 0 (which includes an empty grid).
    """
    from itertools import product

    best_prec = 0
    best_params = (0, 0, 0, 0)
    best_model = None

    # product() yields combinations in the same order as four nested loops
    # (the rightmost list varies fastest).
    grid = product(param_dict['lr'],
                   param_dict['epoch'],
                   param_dict['wordNgrams'],
                   param_dict['dim'])
    for lrate, epoch_num, wng, dim_n in grid:
        model = fasttext.train_supervised(input=train_file,
                                          lr=lrate,
                                          epoch=epoch_num,
                                          wordNgrams=wng,
                                          bucket=bucket,
                                          dim=dim_n)

        # The first element of the result (the example count) is not needed.
        _, prec, rec = model.test(test_file)
        print('lr={}, epoch={}, wordNgrams={}, dim={}, precision={}, recall={}'.format(
            lrate, epoch_num, wng, dim_n, prec, rec))

        # Strict comparison: a later model must beat, not merely match, the best.
        if best_prec < prec:
            best_prec = prec
            best_params = (lrate, epoch_num, wng, dim_n)
            best_model = model

    print('Best parameters: lr={}, epoch={}, wordNgrams={}, dim={}'.format(*best_params))
    return best_model

In [49]:
# Run the full grid search; the per-combination metrics are printed below.
best_model = test_param(data_path+'fasttext_train_twitter.txt', data_path+'fasttext_test_twitter.txt', param_dict)

lr=0.01, epoch=25, wordNgrams=2, dim=10, precision=0.9990302816590999, recall=0.9990302816590999
lr=0.01, epoch=25, wordNgrams=2, dim=50, precision=0.9990743597655045, recall=0.9990743597655045
lr=0.01, epoch=25, wordNgrams=2, dim=100, precision=0.9990302816590999, recall=0.9990302816590999
lr=0.01, epoch=25, wordNgrams=2, dim=256, precision=0.9990743597655045, recall=0.9990743597655045
lr=0.01, epoch=25, wordNgrams=3, dim=10, precision=0.9987658130206726, recall=0.9987658130206726
lr=0.01, epoch=25, wordNgrams=3, dim=50, precision=0.9988098911270772, recall=0.9988098911270772
lr=0.01, epoch=25, wordNgrams=3, dim=100, precision=0.9988539692334817, recall=0.9988539692334817
lr=0.01, epoch=25, wordNgrams=3, dim=256, precision=0.9988980473398863, recall=0.9988980473398863
lr=0.01, epoch=50, wordNgrams=2, dim=10, precision=0.9991625159783136, recall=0.9991625159783136
lr=0.01, epoch=50, wordNgrams=2, dim=50, precision=0.9992506721911226, recall=0.9992506721911226
lr=0.01, epoch=50, wordNgr

lr=0.5, epoch=50, wordNgrams=3, dim=50, precision=0.9989421254462908, recall=0.9989421254462908
lr=0.5, epoch=50, wordNgrams=3, dim=100, precision=0.9988539692334817, recall=0.9988539692334817
lr=0.5, epoch=50, wordNgrams=3, dim=256, precision=0.9990302816590999, recall=0.9990302816590999
lr=0.5, epoch=100, wordNgrams=2, dim=10, precision=0.9992065940847181, recall=0.9992065940847181
lr=0.5, epoch=100, wordNgrams=2, dim=50, precision=0.999118437871909, recall=0.999118437871909
lr=0.5, epoch=100, wordNgrams=2, dim=100, precision=0.9992506721911226, recall=0.9992506721911226
lr=0.5, epoch=100, wordNgrams=2, dim=256, precision=0.9992506721911226, recall=0.9992506721911226
lr=0.5, epoch=100, wordNgrams=3, dim=10, precision=0.9989862035526954, recall=0.9989862035526954
lr=0.5, epoch=100, wordNgrams=3, dim=50, precision=0.9989421254462908, recall=0.9989421254462908
lr=0.5, epoch=100, wordNgrams=3, dim=100, precision=0.9990743597655045, recall=0.9990743597655045
lr=0.5, epoch=100, wordNgrams=

lr=0.05, epoch=50, wordNgrams=2, dim=100, precision=0.99920673394738, recall=0.99920673394738


In [50]:
# Train one model with fixed hyperparameters (presumably the grid-search
# winner - TODO confirm). This rebinds best_model, replacing the model
# returned by test_param.
best_model = fasttext.train_supervised(input=data_path+'fasttext_train_twitter.txt',
                                   lr=0.05,
                                   epoch=50,
                                   wordNgrams=2,
                                   bucket=200000,
                                   dim=100)

In [51]:
# Score the retrained model on the held-out file; the result is a
# (number of examples, precision, recall) tuple.
best_model.test(data_path+'fasttext_test_twitter.txt')

(22687, 0.9992506721911226, 0.9992506721911226)

In [52]:
# Persist the trained model; the relative path resolves against the current
# working directory, not data_path.
best_model.save_model("fasttext_twitter.bin")