# read data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp read_data

In [None]:
#| export
from pathlib import Path
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## create test data

Because of size constraints, this repository contains only a sample of the original data.
The cell below creates that sample by drawing up to 100 random rows from each original `csv` file.

In [None]:
#| notest
#| eval: false

from tqdm.notebook import tqdm

src_dir = Path('../../data')
dest_dir = Path('../data_test')

n = 100  # upper bound on the number of rows kept per csv file

# Mirror every directory found below `src_dir` under `dest_dir` and write a
# down-sampled copy of each csv file it directly contains.
for subdir in src_dir.rglob('*'):
    if not subdir.is_dir():
        continue
    dest_subdir = dest_dir / subdir.relative_to(src_dir)
    dest_subdir.mkdir(parents=True, exist_ok=True)
    # Materialise the listing so that tqdm can display a total.
    csv_paths = list(subdir.glob('*.csv'))
    for file_path in tqdm(csv_paths):
        df = pd.read_csv(file_path, on_bad_lines='skip', engine='python')
        # Files with fewer than `n` rows are kept in full (in shuffled order);
        # the fixed seed makes the sample reproducible.
        df_sample = df.sample(n=min(n, len(df)), random_state=58)
        df_sample.to_csv(dest_subdir / file_path.name, index=False)

  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

## read data

### get file paths

In [None]:
#| export
def get_fpaths_year(year: str, dir='../data_test/years') -> list:
    """Return the csv files in `dir` whose file name starts with `year`.

    Args:
        year: Prefix the file names must start with, e.g. ``'2020'``.
        dir: Directory to search (not searched recursively). The parameter
            name shadows the builtin but is kept for backward compatibility.

    Returns:
        A list of `Path` objects in ascending path order. The list is empty
        when nothing matches or `dir` does not exist.
    """
    dir_path = Path(dir)
    # glob yields entries in an arbitrary, filesystem-dependent order; sorting
    # makes indexing into the result (e.g. `[0]`) reproducible across machines.
    return sorted(dir_path.glob(f'{year}*.csv'))

In [None]:
get_fpaths_year('2020')

[PosixPath('../data_test/years/2020-04-14_21:20:57___2020-04-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-04-07_21:19:06___2020-04-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-14_21:19:36___2020-06-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-10-14_21:19:48___2020-10-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-07_21:19:08___2020-06-07_21:59:59.csv'),
 PosixPath('../data_test/years/2020-02-07_22:18:35___2020-02-07_22:59:59.csv'),
 PosixPath('../data_test/years/2020-07-14_21:22:47___2020-07-14_21:59:59.csv'),
 PosixPath('../data_test/years/2020-06-01_21:59:59___2020-06-01_21:22:24.csv'),
 PosixPath('../data_test/years/2020-09-19_21:14:30___2020-09-19_21:59:59.csv'),
 PosixPath('../data_test/years/2020-07-01_21:59:59___2020-07-01_21:23:38.csv'),
 PosixPath('../data_test/years/2020-10-19_14:18:54___2020-10-19_14:58:31.csv'),
 PosixPath('../data_test/years/2020-02-01_22:59:59___2020-02-01_22:07:59.csv'),
 PosixPath('../data_test/years/2020-08-1

In [None]:
#| export
def get_fpaths_subreddit(subreddit: str, dir='../data_test/subreddits') -> list:
    """Return the csv files in `dir` whose file name starts with `subreddit`.

    Args:
        subreddit: Prefix the file names must start with, e.g.
            ``'conspiracy'``.
        dir: Directory to search (not searched recursively). The parameter
            name shadows the builtin but is kept for backward compatibility.

    Returns:
        A list of `Path` objects in ascending path order. The list is empty
        when nothing matches or `dir` does not exist.
    """
    dir_path = Path(dir)
    # NOTE(review): this is a plain prefix match, so a name that is a prefix of
    # another subreddit's name would also pick up that subreddit's files --
    # confirm whether the file-name separator should be part of the pattern.
    # glob yields entries in an arbitrary, filesystem-dependent order; sorting
    # makes the result reproducible across machines.
    return sorted(dir_path.glob(f'{subreddit}*.csv'))

In [None]:
get_fpaths_subreddit('conspiracy')

[PosixPath('../data_test/subreddits/conspiracy___2020-11-17_11:02:26___2020-11-27_22:59:54.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-03-01_23:00:02___2020-03-09_22:59:59.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-01-03_19:39:57___2020-01-27_22:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-04-17_04:25:29___2020-04-27_21:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-05-14_00:35:50___2020-05-27_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-06_03:54:59___2020-02-27_22:59:57.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-07-01_22:00:04___2020-07-09_21:59:58.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-11-01_23:00:04___2020-11-09_22:59:56.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-02-01_23:00:04___2020-02-09_22:59:55.csv'),
 PosixPath('../data_test/subreddits/conspiracy___2020-08-01_22:00:01___2020-08-09_21:59:56.csv'),
 PosixPath('../data_

### read comments

#### read a single `csv` file of comments

In [None]:
fpath = get_fpaths_year('2019')[0]

In [None]:
#| export
def read_one_comments_csv(fpath):
    """Load one comments csv file and return its cleaned rows.

    Only the columns `id`, `created_utc`, `author`, `subreddit` and `body`
    are read. `created_utc` is parsed as a date; the other four columns are
    read with the pandas `'string'` dtype. Rows with a missing value in any
    of these columns are dropped, and of several rows sharing an `id` only
    the first is kept.

    Returns `None`, after printing a message, when the file does not exist
    or is empty.
    """
    text_cols = ['id', 'author', 'subreddit', 'body']
    try:
        raw = pd.read_csv(
            fpath,
            usecols=['id', 'created_utc', 'author', 'subreddit', 'body'],
            dtype=dict.fromkeys(text_cols, 'string'),
            parse_dates=['created_utc'],
            low_memory=False,
            lineterminator='\n',
        )
    except FileNotFoundError:
        print(f'{fpath} not found on disk')
        return None
    except pd.errors.EmptyDataError:
        print(f'{fpath} is empty')
        return None

    without_missing = raw.dropna()
    return without_missing.drop_duplicates(subset='id')

In [None]:
read_one_comments_csv(fpath)

Unnamed: 0,author,body,created_utc,id,subreddit
0,lilfooty,This'll hurt them more than the loss,2019-05-07 21:55:57,emrz5jp,soccer
1,Kaeneko,"I loved vampires *so* much, lol. Always fantas...",2019-05-07 21:34:12,emrx5eq,BDSMcommunity
2,Les_Deplorables,Poor Zombies gonna starve. No Brains!,2019-05-07 21:21:11,emrvxjq,The_Donald
3,viper2544,No one is going to mention the $12 shipping?,2019-05-07 21:56:45,emrz8g7,legostarwars
4,ninjasquirrelarmy,Agreed. I showed my stylist the Phoenix hair ...,2019-05-07 21:34:43,emrx730,Instagramreality
...,...,...,...,...,...
95,kleptominotaur,Is tj still a muscle pharm dude? or parm? what...,2019-05-07 21:35:05,emrx88m,MMA
96,bonesstackedonfloor,Fidgeting,2019-05-07 21:45:27,emry5y5,AskReddit
97,Perfectoi,Imagine thinking EV will be sacked. Friendly r...,2019-05-07 21:18:06,emrvmx0,Barca
98,BB-Zwei,And Dumbo.,2019-05-07 21:51:35,emryq8t,movies


#### read multiple `csv` files with comments

In [None]:
#| export
def read_multi_comments_csvs(fpaths: list):
    """Read several comments csv files and stack them into one DataFrame.

    Each path is loaded with `read_one_comments_csv`. Files that could not be
    read (for which that function returns `None`) are skipped.

    Args:
        fpaths: Paths of the csv files to read.

    Returns:
        A DataFrame holding the rows of all readable files, with a fresh
        0..n-1 index. When `fpaths` is empty or no file could be read, an
        empty DataFrame with the expected comment columns is returned instead
        of letting `pd.concat` raise a `ValueError`.
    """
    loaded = (read_one_comments_csv(fpath) for fpath in fpaths)
    # Drop failed reads explicitly rather than relying on pd.concat silently
    # discarding None entries.
    frames = [frame for frame in loaded if frame is not None]
    if not frames:
        # Same columns that read_one_comments_csv selects, so downstream
        # column access keeps working on an empty result.
        return pd.DataFrame(
            columns=['id', 'created_utc', 'author', 'subreddit', 'body']
        )
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
fpaths = get_fpaths_year('2019')

In [None]:
read_multi_comments_csvs(fpaths)

Unnamed: 0,author,body,created_utc,id,subreddit
0,lilfooty,This'll hurt them more than the loss,2019-05-07 21:55:57,emrz5jp,soccer
1,Kaeneko,"I loved vampires *so* much, lol. Always fantas...",2019-05-07 21:34:12,emrx5eq,BDSMcommunity
2,Les_Deplorables,Poor Zombies gonna starve. No Brains!,2019-05-07 21:21:11,emrvxjq,The_Donald
3,viper2544,No one is going to mention the $12 shipping?,2019-05-07 21:56:45,emrz8g7,legostarwars
4,ninjasquirrelarmy,Agreed. I showed my stylist the Phoenix hair ...,2019-05-07 21:34:43,emrx730,Instagramreality
...,...,...,...,...,...
4794,m00sedad,Donald Fucking Trump,2019-06-19 21:12:28,erl5gls,AskReddit
4795,Abramabundiz,"obviously the office or parks, or maybe a spin...",2019-06-19 21:35:15,erl7gic,AskReddit
4796,StarrySkye3,That sounds like someone who argues that other...,2019-06-19 21:33:57,erl7ccj,otherkin
4797,mostoriginalusername,I hadn't heard about that one. :),2019-06-19 21:41:22,erl7zzj,catsareliquid
