# Loading dependencies and data files

In [18]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import svm
import math
import re


fake, non_fake  = pd.read_excel('cleaned_files/Fakeset_cleaned_165.xlsx'), pd.read_excel('cleaned_files/Non-fakeset_cleaned_318.xlsx')

dataset = pd.concat([fake, non_fake])

pravda_unian_set = pd.read_excel('cleaned_files/Non-fakeset.xlsx')

testing_set = pravda_unian_set[(pravda_unian_set['url'].isin(dataset['url']) == False)]

dataset.reset_index(np.linspace(1,len(dataset)), inplace=True)

In [19]:
### Creating features with propaganda-sensitive words

pol_words = ['донецкая народная республика','ополченцы', 'фашисты', 'радикальной', 'незалежная', 'киевские власти', 'народный', 'савченко', 'народная', 'каратели', 'киев', 'ордо','террористы', 'боевики', 'гибридная', 'ато', 'незаконные вооруженные формирования', 'пророссийские боевики','бандформирования']

def political_words(row):
    data = row['text'].lower()
    words = pol_words
    for word in words:
        if word in data:
            row[word] = 1
        else:
            row[word] = 0
    return row

def build_log_scale(row):
    scale = pow(len(row['text']), 1/10)
    spmarks_ratio = len(re.findall('[,.?!:;"]', row['text']))
    score = math.log(spmarks_ratio, scale)
    return score

def build_digit_scale(row):
    scale = pow(len(row['text']), 1/10)
    digit_ratio = len(re.findall('(\d{1,2})+', row['text']))
    if digit_ratio == 0:
        score = 1
    else:
        score = math.log(digit_ratio, scale)
    return score

dataset = dataset.apply(political_words, axis=1)

dataset['special_marks'] = dataset.apply(build_log_scale, axis=1)
dataset['digit_freq'] = dataset.apply(build_digit_scale, axis=1)


# Model training 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(dataset.loc[:,'донецкая народная республика': 'digit_freq'], 
                                                    dataset['target'], 
                                                    random_state=0)

model = svm.SVC(kernel='linear', C=100).fit(X_train, y_train)

predictions = model.predict(X_test)

auc = roc_auc_score(y_test, predictions)

accuracy = model.score(X_test, y_test)

# Testing 

In [22]:
testing_set = testing_set.apply(political_words, axis=1)
testing_set['special_marks'] = testing_set.apply(build_log_scale, axis=1)
testing_set['digit_freq'] = testing_set.apply(build_digit_scale, axis=1)

res = model.predict(test.loc[:,'special_marks': 'бандформирования'])

'''By using only matrix of politically sensitive words and additional features, \n
the accuracy and auc score are much below the vectorizered model. However, the model does not overfitt data and perform
extremely well on third party set'''
print({'accuracy': accuracy, 'auc': auc, 'test_set_accuracy': (1-np.count_nonzero(res)/len(res))})


{'accuracy': 0.85950413223140498, 'auc': 0.83594771241830057, 'test_set_accuracy': 1.0}
