# Loading dependencies; defining functions

In [5]:
import pickle
import os.path
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import pandas as pd

def text_deserializer(filename):
    """Load and return the object pickled in the file at ``filename``.

    NOTE: unpickling can run arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

def text_serializer(filename, text):
    """Pickle ``text`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content is
    replaced. Nothing is returned.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(text, sink)
        
from nltk.stem.snowball import RussianStemmer
# Module-level stemmer and stopword set used by words_and_chars below.
stemmer = RussianStemmer()
# NOTE(review): stopwords.words() reads the NLTK "stopwords" corpus —
# confirm it is downloaded on the machine running this notebook.
stops = set(stopwords.words("russian"))

def words_and_chars(data):
    """Deprecated. Split articles into stemmed Russian words and punctuation.

    Args:
        data: Container of article strings addressable as ``data[0]`` ..
            ``data[len(data) - 1]`` (e.g. a list).

    Returns:
        A ``(stemmed_words, special_marks)`` tuple of two lists with one
        entry per article, in input order:

        * ``stemmed_words`` - stems of the article's Cyrillic words, with
          stopwords removed;
        * ``special_marks`` - every ``,``, ``?``, ``.`` and ``-`` character
          found in the article.
    """
    # Index access (rather than plain iteration) is kept so that any
    # container indexable by 0..n-1 keeps working exactly as before.
    texts = [data[idx] for idx in range(len(data))]

    special_marks = [re.findall(r"[,?.-]", text) for text in texts]

    # "ё"/"Ё" lie outside the а-я / А-Я code point ranges, so they are listed
    # explicitly; otherwise words containing them are cut in two.
    articles = [re.findall(r"[а-яА-ЯёЁ]+", text) for text in texts]

    stemmed_words = []
    for article in articles:
        # NLTK's stopword entries are lowercase, so compare on the lowercased
        # word; this also drops capitalised stopwords at sentence starts.
        words = [stemmer.stem(word) for word in article if word.lower() not in stops]
        stemmed_words.append(words)
    return stemmed_words, special_marks
      

def filter_by_keywords(keywords, data, min_words=50, label=None):
    """Select the articles whose text mentions at least one keyword.

    Args:
        keywords: Iterable of substrings to look for. They are matched
            against the lowercased article text, so they should be given
            in lowercase.
        data: Mapping of article key (returned under ``'url'``) to article
            text.
        min_words: Minimum text length. Despite the name, this is compared
            with ``len(text)``, i.e. a number of characters; that behaviour
            is kept for backward compatibility.
        label: Value stored unchanged under ``'target'`` in the result.

    Returns:
        A dict with three entries: ``'url'`` (keys of the matching
        articles, in ``data`` order), ``'text'`` (their texts, in the same
        order) and ``'target'`` (``label``).
    """
    urls, texts = [], []
    for url, text in data.items():
        # Skip articles that are too short or effectively empty.
        if (len(text) < min_words) or (text == '') or (text == ' '):
            continue
        # Lowercase once per article instead of once per keyword, and stop
        # at the first keyword that matches.
        lowered = text.lower()
        if any(keyword in lowered for keyword in keywords):
            urls.append(url)
            texts.append(text)
    return {'url': urls, 'text': texts, 'target': label}


In [None]:
# Loading data: un-pickle the raw article collections stored under raw_data/.

fake_text = text_deserializer(os.path.join('raw_data', 'faketext_#807_aug_11_2017'))
ukr_text_war = text_deserializer(os.path.join('raw_data', 'unian_ukrpravda_war_#1554_aug_11_2017'))
ukr_text_minsk = text_deserializer(os.path.join('raw_data', 'unian_ukrpravda_minsk_#267_aug_11_2017'))
test_set = text_deserializer(os.path.join('raw_data', 'test_set'))
# Merge the "war" and "minsk" collections into one, leaving ukr_text_war
# itself untouched by working on a copy.
# NOTE(review): presumably both are dicts of url -> article text, in which
# case "minsk" entries win on duplicate keys — confirm against the pickles.
ukr_text = ukr_text_war.copy()
ukr_text.update(ukr_text_minsk)

# Filtering articles by keywords

In [None]:
# Search terms, all lowercase: filter_by_keywords looks for them in the
# lowercased article text.
keys = [
    'обсе',
    'минский договор',
    'нормандский',
    'лнр',
    'днр',
    'плотницкий',
    'захарченко',
    'ополченцы',
    'боевики',
    'незалежная',
    'народная республика',
    'киев',
    'силовики',
]

# Target 1 is attached to the fake collection, target 0 to the Ukrainian one.
fake = filter_by_keywords(keys, fake_text, label=1)
non_fake = filter_by_keywords(keys, ukr_text, label=0)

# Number of articles kept in each class, one count per line.
print(len(fake['text']), len(non_fake['text']), sep='\n')

# Structuring data

In [None]:
# One DataFrame per class, built from the filter_by_keywords results.
dataset_fake, dataset_non_fake = pd.DataFrame(fake), pd.DataFrame(non_fake)

# Stack both classes. ignore_index is deliberately not used, so each half
# keeps its own row index.
dataset = pd.concat([dataset_fake, dataset_non_fake])

# Order the columns by name, descending: url, text, target.
dataset.sort_index(axis=1, inplace=True, ascending=False)

# Cyrillic-only version of the text, words separated by single spaces.
# '+' (not '*') is required: '*' also matches the empty string at every
# non-Cyrillic position, which filled ru_text with runs of blanks.
# "ё"/"Ё" lie outside the а-я / А-Я ranges and are listed explicitly.
dataset['ru_text'] = dataset['text'].str.findall(r'[а-яА-ЯёЁ]+').str.join(' ')

# Human-readable class name derived from the numeric target.
dataset['label'] = dataset['target'].apply(lambda x: 'Fake' if x == 1 else 'Not-Fake')

# Leading "scheme://host" part of the URL. The pattern is a raw string so
# that \w is not treated as an (invalid) string escape, and expand=False
# makes extract return a Series rather than a one-column DataFrame.
# NOTE(review): the unescaped '.' matches any character, so the cut-off
# point differs between "www.site.tld" and "site.tld" hosts — confirm this
# is the intended grouping key.
dataset['url_shorten'] = dataset['url'].str.extract(r'(^h\w*://\w{3,10}.\w{2,10}.)', expand=False)

In [None]:
## Saving to Excel
# Split the combined dataset back into its two classes by target value.
fake_set = dataset.loc[dataset['target'] == 1]
non_fake_set = dataset.loc[dataset['target'] == 0]

# Write each class to its own workbook in the current working directory.
fake_set.to_excel('Fakeset.xlsx')
non_fake_set.to_excel('Non-fakeset.xlsx')

In [None]:
## Saving Zik file to Excel

# One row per (key, value) pair of test_set, stored as the url and text columns.
zik = pd.DataFrame(data=[*test_set.items()], columns=['url', 'text'])
zik.to_excel(excel_writer='cleaned_files/zik_news.xlsx')