In [2]:
import pathlib as pl
import numpy as np
import pandas as pd
import os

In [37]:
class TrainingData:
    def __init__(self, file_path: pl.Path) -> None:
        """Instantiate TrainingData class object."""
        self.df = self.get_training_data(file_path)
        self.rinse_training_data()
        self.add_labels()

    def get_training_data(self, file_path: pl.Path) -> pd.DataFrame:
        """Import and typecast training data from csv file."""
        df = pd.read_csv(file_path)
        df = df.drop(df.columns[0], axis=1)
        df["type"] = pd.Categorical(df.type)
        return df

    def rinse_training_data(self) -> None:
        """Rinse training data."""
        df = self.df[self.df['type'].notna()]
        self.df = df

    def add_labels(self) -> None:
        """Add custom labels based on 'type' column."""
        labels = {
            "unreliable": "unknown",
            "fake": "fake",
            "clickbait": "unknown",
            "conspiracy": "fake",
            "reliable": "real",
            "bias": "unknown",
            "hate": "unknown",
            "junksci": "fake",
            "political": "unknown",
            "unknown": "unknown"
        }
        def lookup_labels(data) -> pd.Categorical:
            return labels[data["type"]]
        self.df["labels"] = self.df.apply(lookup_labels, axis=1)



In [13]:
# parent = src/, parent.parent = fake-news/
# file_path = pl.Path(os.path.abspath('')).parent.resolve() / "data_files/corpus/reduced_corpus.csv"
file_path = pl.Path(os.path.abspath('')).parent.resolve() / "data_files/processed_csv/summarized_corpus_valset2.csv"
# file_path = pl.Path(os.path.abspath('')).parent.resolve() / "data_files/processed_csv/shortened_corpus_valset2.csv"

df = pd.read_csv(file_path)
df

Unnamed: 0,id,domain,type,url,scraped_at,title,authors,keywords,tags,summary,words,content_len,mean_word_len,median_word_len,split
0,34,beforeitsnews.com,fake,http://beforeitsnews.com/opinion-conservative/...,2018-01-25 16:17:44.789555,Surprise: Socialist Hotbed Of Venezuela Has Lo...,The Pirate'S Cove,,,,{},1574,5.253112,4.0,3
1,35,beforeitsnews.com,fake,http://beforeitsnews.com/politics/2018/01/wate...,2018-01-25 16:17:44.789555,Water Cooler 1/25/18 Open Thread; Fake News ? ...,,,,,{},6647,5.152299,4.0,4
2,36,beforeitsnews.com,fake,http://beforeitsnews.com/politics/2018/01/vete...,2018-01-25 16:17:44.789555,Veteran Commentator Calls Out the Growing “Eth...,,,,,{},3144,4.731660,4.0,7
3,37,beforeitsnews.com,fake,http://beforeitsnews.com/arts/2018/01/lost-wor...,2018-01-25 16:17:44.789555,"Lost Words, Hidden Words, Otters, Banks and Books",Jackie Morris Artist,,,,{},3587,4.386541,4.0,8
4,39,beforeitsnews.com,fake,http://beforeitsnews.com/environment/2018/01/s...,2018-01-25 16:17:44.789555,Scientists move Doomsday Clock ahead by 30 sec...,Desdemona Despair,,,,{},17970,5.359592,5.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,592,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,2018-01-25 16:17:44.789555,Boss offers his staff TWO DAYS off if they sup...,,,,,{},370,4.492063,4.0,6
73,593,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,2018-01-25 16:17:44.789555,Alabama declares flu outbreak emergency as vac...,,,"vaccine failure, influenza, Heavy metals, Flu ...",,{},2549,5.732044,5.0,4
74,594,christianpost.com,reliable,https://www.christianpost.com/news/hindus-burn...,2018-01-25 16:17:44.789555,Hindus Burn Down Church in India Alleging 'For...,,,,,{},4878,4.952321,4.0,6
75,595,christianpost.com,reliable,https://www.christianpost.com/news/dont-let-po...,2018-01-25 16:17:44.789555,Don't Let Political Identity Trump Your Identi...,,,,,{},4643,4.782497,4.0,9


In [1]:
#from data_importer.py import TrainingData

In [2]:
import pandas as pd
import numpy as np

# Generate some example data
n_rows = 5
fake_data = np.random.randint(0, 10, size=n_rows)
real_data = np.random.randint(0, 10, size=n_rows)

# Create the DataFrame
df = pd.DataFrame({'fake': fake_data, 'real': real_data})


df['freq'] = df['fake'] + df['real'] 

for i in range(len(df)):
    print(df.iloc[i])
    


fake    0
real    2
freq    2
Name: 0, dtype: int32
fake     9
real     5
freq    14
Name: 1, dtype: int32
fake     8
real     8
freq    16
Name: 2, dtype: int32
fake    0
real    4
freq    4
Name: 3, dtype: int32
fake     5
real     7
freq    12
Name: 4, dtype: int32


In [5]:
import pathlib as pl
import numpy as np
import pandas as pd
import json

def json_to_pd(file_path : str = "data_files/words/included_words.json") -> pd.DataFrame:
    """Take a json file location as argument and convert it to a pandas dataframe.
     The dataframe is filtered to only show the columns: word, fake, real.
     
     - Argument: File location is relative to the fake-news folder"""

    # file reference for dataframe
    json_file_path = pl.Path(__file__).resolve().parent.parent.parent / file_path

    # creating dataframe by reading json file directly
    df = pd.read_json(json_file_path, orient="index")

    # filtering for fake and reliable and replacing NaN with [0,0]
    df = df.filter(items=['fake', 'reliable'], axis=1)
    df = df.rename(columns={'reliable':'real'})
    df = df.applymap(lambda x: [0,0] if x is np.nan else x)
    df['freq'] = df['fake'] + df['real'] # adding new column with sum of fake and real
    print(df.head())
    return df

json_to_pd()

NameError: name '__file__' is not defined