<a href="https://colab.research.google.com/github/tmc2/Projeto_estatistica_2020.1/blob/master/1_Data_Aquisition_%26_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# The %%capture cell magic above suppresses everything this cell would print.

# Change the working directory to the parent folder so that the relative paths
# used later in the notebook (e.g. data/raw/) resolve from there
# (presumably the project root -- confirm for your checkout).
# NOTE(review): not idempotent -- every re-run of this cell moves up one more level.
%cd ..
# Automatically reload imported modules before executing code.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Data Acquisition and Preparation

In [None]:
# Project directories, expressed relative to the current working directory.
RAW_DATA_FOLDER = Path('data/raw/')  # input files read by this notebook
INTERMEDIATE_DATA_FOLDER = Path('data/interim/')  # where the merged CSV is written
REFERENCE_FOLDER = Path('references/')  # where the data dictionary JSON is written

## Downloading Data

In [None]:
# TODO: Data Acquisition

## Preparing Dataset

In [None]:
# Sub-folders of the raw data directory, one pair per class:
# the *_DATA_FOLDER entries are passed to the text loader and the
# *_META_FOLDER entries to the metadata loader.
FAKE_DATA_FOLDER = RAW_DATA_FOLDER / 'fake'
TRUE_DATA_FOLDER = RAW_DATA_FOLDER / 'true'
FAKE_META_FOLDER = RAW_DATA_FOLDER / 'fake-meta-information'
TRUE_META_FOLDER = RAW_DATA_FOLDER / 'true-meta-information'

### Text datasets

In [None]:
def create_text_dataframe(folder):
    """Load every ``*.txt`` file of ``folder`` into a one-column DataFrame.

    Parameters
    ----------
    folder : pathlib.Path
        Directory whose ``*.txt`` files are read as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        One row per file, labelled with the file stem (file name without
        extension), holding the full file content in the ``text`` column.
    """
    texts_by_stem = {
        txt_path.stem: txt_path.read_text(encoding='utf-8')
        for txt_path in folder.glob("*.txt")
    }
    return pd.DataFrame.from_dict(texts_by_stem, orient='index', columns=['text'])

In [None]:
# Read the article bodies of each class into its own DataFrame (fake first, then true).
fake_text_df, true_text_df = (
    create_text_dataframe(text_folder)
    for text_folder in (FAKE_DATA_FOLDER, TRUE_DATA_FOLDER)
)

### Metadata Datasets

In [None]:
def create_metadata_datasets(folder, metadata_columns, metadata_dtypes):
    """Build a DataFrame from one-value-per-line metadata files.

    Every ``*.txt`` file in ``folder`` contributes one row: line *i* of the
    file is stored under ``metadata_columns[i]`` (extra lines are ignored).
    The row label is the part of the file stem that precedes the first ``-``.

    Parameters
    ----------
    folder : pathlib.Path
        Directory containing the metadata ``*.txt`` files.
    metadata_columns : list of str
        Column names, in the same order as the lines of each file.
    metadata_dtypes : dict
        Mapping ``column -> dtype`` passed to ``DataFrame.astype``.
        Columns that cannot be converted are left unchanged
        (``errors='ignore'``).

    Returns
    -------
    pandas.DataFrame
        One row per file, with the literal string ``"None"`` replaced by NaN,
        columns cast according to ``metadata_dtypes`` and an unnamed index.
    """
    df_dict = {col: [] for col in metadata_columns}
    df_dict["index"] = []

    for filepath in folder.glob("*.txt"):
        # Explicit UTF-8 (same as create_text_dataframe) so the result does not
        # depend on the platform's default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            df_dict["index"].append(filepath.stem.split("-")[0])
            for col, line in zip(metadata_columns, f):
                # Strip only the line terminator: a final line without a
                # trailing newline must not lose its last character.
                df_dict[col].append(line.rstrip("\n"))

    df = pd.DataFrame(df_dict)
    df = df.replace("None", np.nan)
    df = df.astype(metadata_dtypes, errors='ignore').set_index("index", drop=True)
    df.index.name = None

    return df

In [None]:
# Column names, in the order in which the values appear (one per line) in each
# metadata file. The misspelled identifiers ("subjuntive", "lenght") are kept
# as-is because they become column names of the exported dataset.
metadata_columns = [
    "author", "link", "category", "date_of_publication",
    "tokens", "words_no_punctuation", "types", "links_inside", 
    "upper_words", "verbs", "subjuntive_imperative_verbs",
    "nouns", "adjectives", "adverbs", "modal_verbs", 
    "singular_first_second_personal_pronouns",
    "plural_first_personal_pronouns", "pronouns",
    "pausality", "characters", "average_sentence_length",
    "average_word_lenght", "percentage_spelling_errors",
    "emotiveness", "diversity"
]

# Human-readable description of each column, parallel to `metadata_columns`
# (same length, same order). Used to build the data dictionary.
metadata_translate = [
    "author", "link", "category", "date of publication", "number of tokens",
    "number of words without punctuation", "number of types",
    "number of links inside the news", "number of words in upper case",
    "number of verbs", "number of subjunctive and imperative verbs",
    "number of nouns", "number of adjectives", "number of adverbs",
    "number of modal verbs (mainly auxiliary verbs)",
    "number of singular first and second personal pronouns",
    "number of plural first personal pronouns", "number of pronouns",
    "pausality", "number of characters", "average sentence length",
    "average word length", "percentage of news with spelling errors",
    "emotiveness", "diversity"
]

# Target dtype of each column. Counts are stored as float rather than int so
# that missing values (NaN) can be represented.
metadata_dtypes = {
    "author": "string", "link": "string", "category": "string",
    "date_of_publication": "datetime64[ns]",
    "tokens": "float", "words_no_punctuation": "float",
    "types": "float","links_inside": "float", "upper_words": "float",
    "verbs": "float", "subjuntive_imperative_verbs": "float", "nouns": "float", 
    "adjectives": "float", "adverbs": "float","modal_verbs": "float", 
    "singular_first_second_personal_pronouns": "float",
    "plural_first_personal_pronouns": "float", "pronouns": "float","characters": "float",
    "pausality": "float", "average_sentence_length": "float",
    "average_word_lenght": "float", "percentage_spelling_errors": "float",
    "emotiveness": "float", "diversity": "float"
}

# The three structures above are maintained by hand; fail fast if they drift apart.
assert len(metadata_columns) == len(metadata_translate), \
    "metadata_columns and metadata_translate must have the same length"
assert set(metadata_dtypes) == set(metadata_columns), \
    "metadata_dtypes must define a dtype for exactly the columns in metadata_columns"

In [None]:
# Parse the metadata files of each class with the shared column/dtype schema
# (fake first, then true).
fake_metadata_df, true_metadata_df = (
    create_metadata_datasets(meta_folder, metadata_columns, metadata_dtypes)
    for meta_folder in (FAKE_META_FOLDER, TRUE_META_FOLDER)
)

In [None]:
# Distinct values of the links_inside column in the fake-news metadata.
fake_metadata_df["links_inside"].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  9., 11.,  6.,  8.])

In [None]:
# Number of missing links_inside values in the fake-news metadata.
fake_metadata_df["links_inside"].isna().sum()

0

In [None]:
# Spot-check the metadata rows labelled "69" and "61" (all columns).
true_metadata_df.loc[["69", "61"], :]

In [None]:
# Spot-check the text of the true-news row labelled '68'.
true_text_df.loc[['68'], 'text']

## Merging Created Datasets

### Fake Dataset

In [None]:
# Join text and metadata column-wise on their shared index, order the rows
# numerically by that index, then replace it with a fresh 0..n-1 index.
fake_df = (
    pd.concat([fake_text_df, fake_metadata_df], axis=1, sort=False)
    .pipe(lambda merged: merged.set_axis(merged.index.astype(int), axis=0))
    .sort_index()
    .reset_index(drop=True)
)

### True Dataset

In [None]:
# Join text and metadata column-wise on their shared index, order the rows
# numerically by that index, then replace it with a fresh 0..n-1 index.
true_df = (
    pd.concat([true_text_df, true_metadata_df], axis=1, sort=False)
    .pipe(lambda merged: merged.set_axis(merged.index.astype(int), axis=0))
    .sort_index()
    .reset_index(drop=True)
)

## Merge All Datasets

In [None]:
# Stack both classes. The concat keys form the outer index level, which is
# then turned into a regular "class" column holding 'True' or 'Fake'.
result = pd.concat([true_df, fake_df], keys=['True', 'Fake'])
result = result.reset_index(level=0).rename(columns={"level_0": "class"})

# Make sure the output folder exists: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
INTERMEDIATE_DATA_FOLDER.mkdir(parents=True, exist_ok=True)
result.to_csv(INTERMEDIATE_DATA_FOLDER/"fake_true_news.csv", index=False)

## Columns Information

In [None]:
# Data dictionary: column name -> human-readable description.
# The 'text' column comes first, followed by the metadata columns in order.
columns_info = {
    'text': 'Text extracted from the news',
    **dict(zip(metadata_columns, metadata_translate)),
}

In [None]:
# Persist the data dictionary as JSON. The output folder is created if it is
# missing, and the `with` block closes the file, so no explicit close() is needed.
REFERENCE_FOLDER.mkdir(parents=True, exist_ok=True)
with open(REFERENCE_FOLDER / "news_data_dictionary.json", "w", encoding="utf-8") as f:
    json.dump(columns_info, f)