In [2]:
import pathlib as pl
import numpy as np
import pandas as pd
import os

In [37]:
class TrainingData:
    """Load a corpus CSV, drop rows without a 'type', and add coarse labels.

    After construction, ``self.df`` holds the cleaned frame: the first CSV
    column removed, 'type' stored as a categorical, rows with a missing
    'type' dropped, and a 'labels' column derived from 'type'.
    """

    # Maps each fine-grained 'type' value to a coarse label.
    LABELS = {
        "unreliable": "unknown",
        "fake": "fake",
        "clickbait": "unknown",
        "conspiracy": "fake",
        "reliable": "real",
        "bias": "unknown",
        "hate": "unknown",
        "junksci": "fake",
        "political": "unknown",
        "unknown": "unknown",
    }
    # Label assigned to any 'type' value that has no entry in LABELS.
    DEFAULT_LABEL = "unknown"

    def __init__(self, file_path: pl.Path) -> None:
        """Read ``file_path`` and run the cleaning and labelling steps.

        Args:
            file_path: Location of the CSV file to load.
        """
        self.df = self.get_training_data(file_path)
        self.rinse_training_data()
        self.add_labels()

    def get_training_data(self, file_path: pl.Path) -> pd.DataFrame:
        """Import and typecast training data from a csv file.

        Args:
            file_path: Location of the CSV file to load.

        Returns:
            The file contents without the first column and with 'type'
            converted to a categorical column.
        """
        df = pd.read_csv(file_path)
        # The first column is dropped unconditionally; presumably it is the
        # saved index (it shows up as 'Unnamed: 0' when the file is read raw).
        df = df.drop(columns=df.columns[0])
        df["type"] = pd.Categorical(df["type"])
        return df

    def rinse_training_data(self) -> None:
        """Drop every row whose 'type' is missing."""
        # .copy() makes self.df an independent frame, so adding columns later
        # does not write into a slice of the unfiltered data.
        self.df = self.df[self.df["type"].notna()].copy()

    def add_labels(self) -> None:
        """Add a 'labels' column derived from the 'type' column.

        Each 'type' value is looked up in ``LABELS``; values without an entry
        (and missing values) receive ``DEFAULT_LABEL`` instead of raising
        ``KeyError``. The lookup is vectorised and also works on an empty
        frame.
        """
        # Convert to object first so the result is a plain string column
        # regardless of how the categorical would map its categories.
        type_values = self.df["type"].astype(object)
        self.df["labels"] = type_values.map(self.LABELS).fillna(self.DEFAULT_LABEL)



In [30]:
# Paths are resolved relative to the notebook's working directory:
# its parent is src/ and its grandparent is fake-news/.
# Alternative input file: "data_files/corpus/reduced_corpus.csv"
file_path = pl.Path.cwd().parent.resolve().joinpath(
    "data_files/processed_csv/summarized_corpus.csv"
)

# Load the summarized corpus as-is and display it.
df = pd.read_csv(file_path)
df

Unnamed: 0,id,domain,type,url,scraped_at,title,authors,keywords,tags,summary,shortened,content_len,mean_word_len,median_word_len
0,34,beforeitsnews.com,fake,http://beforeitsnews.com/opinion-conservative/...,2018-01-25 16:17:44.789555,Surprise: Socialist Hotbed Of Venezuela Has Lo...,The Pirate'S Cove,,,,Headline: Bitcoin & Blockchain Searches Exceed...,1574,5.410256,4.0
1,35,beforeitsnews.com,fake,http://beforeitsnews.com/politics/2018/01/wate...,2018-01-25 16:17:44.789555,Water Cooler 1/25/18 Open Thread; Fake News ? ...,,,,,Water Cooler 1/25/18 Open Thread; Fake News ? ...,6647,5.242690,4.0
2,36,beforeitsnews.com,fake,http://beforeitsnews.com/politics/2018/01/vete...,2018-01-25 16:17:44.789555,Veteran Commentator Calls Out the Growing “Eth...,,,,,Veteran Commentator Calls Out the Growing “Eth...,3144,4.911824,4.0
3,37,beforeitsnews.com,fake,http://beforeitsnews.com/arts/2018/01/lost-wor...,2018-01-25 16:17:44.789555,"Lost Words, Hidden Words, Otters, Banks and Books",Jackie Morris Artist,,,,"Lost Words, Hidden Words, Otters, Banks and Bo...",3587,4.565147,4.0
4,38,beforeitsnews.com,fake,http://beforeitsnews.com/financial-markets/201...,2018-01-25 16:17:44.789555,Red Alert: Bond Yields Are SCREAMING “Inflatio...,Phoenix Capital Research,,,,Red Alert: Bond Yields Are SCREAMING “Inflatio...,637,5.617978,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,21612,beforeitsnews.com,fake,http://beforeitsnews.com/strange/2017/04/our-r...,2018-01-25 16:17:44.789555,Our “romances” with UFOs and the banality of b...,Ufo Iconoclast,,,,Our “romances” with UFOs and the banality of b...,2316,5.047619,4.0
9996,21613,beforeitsnews.com,fake,http://beforeitsnews.com/strange/2017/03/light...,2018-01-25 16:17:44.789555,Lights in the sky. (Give me a break!),Ufo Iconoclast,,,,Lights in the sky. (Give me a break!)\n\n% of ...,804,5.531532,4.0
9997,21614,beforeitsnews.com,fake,http://beforeitsnews.com/strange/2017/04/from-...,2018-01-25 16:17:44.789555,From my Facebook feed,Ufo Iconoclast,,,,How To Easily Understand The Difference Betwee...,102,4.150000,3.0
9998,21615,beforeitsnews.com,fake,http://beforeitsnews.com/strange/2017/04/a-bib...,2018-01-25 16:17:44.789555,"A Biblical miracle/myth or psychotic episodes,...",Ufo Iconoclast,,,,"A Biblical miracle/myth or psychotic episodes,...",2259,5.181287,4.0
