In [None]:
# default_exp read_data
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Reading the data

> This notebook covers reading the Reddit data.

## Variables

In [None]:
# Example values used by the demonstration cells of this notebook.
SUBREDDIT = 'askreddit'
LIMIT = 100_000
YEARS = list(range(2006, 2021))  # 2006 through 2020 inclusive
YEAR = 2010

## Imports

In [None]:
# export
from glob import glob
import os
from pathlib import Path

import pandas as pd

## `Google Cloud Storage` authentication

## Get file paths

### per lexeme

In [None]:
#export
def get_fpaths_lex(LEX, CORPUS_DIR='data/', source='local', bucket_name='socemb'):
    """Return the paths of all comment files stored for the lexeme `LEX`.

    Parameters
    ----------
    LEX : name of the lexeme; used as the sub-directory name (local) or as the
        blob-name prefix `comments/{LEX}` (remote).
    CORPUS_DIR : local root directory that contains one folder per lexeme;
        a trailing separator is optional. Only used when `source='local'`.
    source : `'local'` to glob `*.csv` files on disk, `'remote'` to list blobs
        in the bucket `bucket_name`.
    bucket_name : name of the bucket queried when `source='remote'`.

    Returns
    -------
    A list of path strings: sorted local file paths, or `gs://` URLs.

    Raises
    ------
    ValueError : if `source` is neither `'local'` nor `'remote'`.
    """
    if source == 'remote':
        # NOTE(review): `storage` is not imported anywhere in the visible notebook;
        # presumably `from google.cloud import storage` -- confirm and add it to
        # the imports cell, otherwise this branch raises a NameError.
        client = storage.Client()
        return [
            f'gs://{bucket_name}/{blob.name}'
            for blob in client.list_blobs(bucket_name, prefix=f'comments/{LEX}')
        ]
    if source == 'local':
        # os.path.join copes with `CORPUS_DIR` given with or without a trailing separator.
        lex_pattern = os.path.join(CORPUS_DIR, LEX, '*.csv')
        # glob order depends on the file system; sort so results are reproducible.
        return sorted(glob(lex_pattern))
    # Previously an unknown `source` fell through to an unbound `fpaths`.
    raise ValueError(f"unknown source {source!r}; expected 'local' or 'remote'")

In [None]:
fpaths_lex = get_fpaths_lex('Anglo-Saxon', source='local')

In [None]:
fpaths_lex

['data/Anglo-Saxon/Anglo-Saxon_2006-02-04--2020-08-13.csv',
 'data/Anglo-Saxon/Anglo-Saxon_2019-12-31--2020-10-06.csv']

In [None]:
assert len(fpaths_lex) == 2

### per subreddit (and year)

In [None]:
# export
def get_fpath_subr_yr(SUBREDDIT, LIMIT, YEAR, DIR='data/subreddit/'):
    """Build the csv path for one subreddit and one year.

    Parameters
    ----------
    SUBREDDIT : subreddit name; becomes the folder name below `DIR`.
    LIMIT : comment limit; first part of the file name.
    YEAR : year; second part of the file name.
    DIR : root directory of the per-subreddit folders; a trailing `/` is optional.

    Returns
    -------
    The string `{DIR}/{SUBREDDIT}/{LIMIT}_{YEAR}.csv`. The path is only
    constructed; the file is not checked for existence.
    """
    base = str(DIR)
    # Tolerate a root given without its trailing slash.
    if base and not base.endswith('/'):
        base += '/'
    return f'{base}{SUBREDDIT}/{LIMIT}_{YEAR}.csv'

In [None]:
get_fpath_subr_yr('askreddit', 100_000, 2009) == 'data/subreddit/askreddit/100000_2009.csv'

True

In [None]:
# export
def get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS):
    """Build one csv path per year for a subreddit.

    Calls `get_fpath_subr_yr(SUBREDDIT, LIMIT, yr)` for every `yr` in `YEARS`
    and returns the resulting paths as a list, in the order of `YEARS`.
    """
    return [get_fpath_subr_yr(SUBREDDIT, LIMIT, yr) for yr in YEARS]

In [None]:
# One path is generated per requested year; `YEARS` (2006-2020) has 15 entries,
# so compare against its length instead of a hard-coded count.
assert len(get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS)) == len(YEARS)

### per year

In [None]:
# export
def get_fpaths_yr(YEAR, DIR='data/subreddit/'):
    """Collect every csv file below `DIR` whose name ends in `{YEAR}.csv`.

    Parameters
    ----------
    YEAR : year used as the file-name suffix to match.
    DIR : directory that is searched recursively.

    Returns
    -------
    A sorted list of `pathlib.Path` objects (empty when nothing matches).
    """
    # rglob yields matches in file-system order; sort so the result is reproducible.
    return sorted(Path(DIR).rglob(f'*{YEAR}.csv'))

In [None]:
# NOTE(review): `read_comm_csvs` and `fpaths` are only defined further down in
# this notebook, so this cell fails on a fresh kernel (Restart & Run All).
# Presumably `fpaths = get_fpaths_yr(YEAR)` was meant to precede it -- confirm,
# and move this cell below the definition of `read_comm_csvs`.
read_comm_csvs(fpaths)

data/subreddit/asklibertarians/100000_2010.csv is empty
data/subreddit/askaconservative/100000_2010.csv is empty


Unnamed: 0,body,created_utc,id,subreddit
0,"That's funny, I only remember hearing that fro...",2010-12-30 22:59:53,c1aw4fb,politics
1,&gt; You are completely wrong or just flat out...,2010-12-30 22:59:43,c1aw4ek,politics
2,[deleted],2010-12-30 22:59:37,c1aw4eb,politics
3,"Also, those in the highest tax bracket have th...",2010-12-30 22:59:34,c1aw4e7,politics
4,It is if it's free. You just target people who...,2010-12-30 22:59:33,c1aw4e5,politics
...,...,...,...,...
211095,Blankets + pets.,2010-12-27 20:42:21,c1ai194,AskReddit
211096,good question,2010-12-27 20:42:20,c1ai193,AskReddit
211097,"it's probably a crime in your state as well, l...",2010-12-27 20:42:18,c1ai18u,AskReddit
211098,"Actually, being in Egypt, the one I hear the m...",2010-12-27 20:42:14,c1ai18j,AskReddit


## Read comments

### Read a single comment `csv` file

In [None]:
fpath = get_fpath_subr_yr('askreddit', 100_000, 2009)

In [None]:
# export
def read_comm_csv(fpath):
    """Read one comments csv file into a cleaned `DataFrame`.

    Only the columns `id`, `created_utc`, `subreddit` and `body` are loaded.
    `created_utc` is interpreted as epoch seconds and converted to datetimes;
    values that cannot be converted become `NaT`. Rows with a missing value in
    any of the four columns are then dropped, as are rows repeating an `id`
    (the first occurrence is kept). The index is not reset.

    Parameters
    ----------
    fpath : path of the csv file to read.

    Returns
    -------
    The cleaned `DataFrame`, or `None` (after printing a message) when the
    file does not exist or is empty.
    """
    try:
        comments = pd.read_csv(
            fpath,
            usecols=['id', 'created_utc', 'subreddit', 'body'],
            dtype={
                'id': 'string',
                'subreddit': 'string',
                'body': 'string'
            },
            low_memory=False,
            lineterminator='\n'
        )
    except FileNotFoundError:
        print(f'{fpath} not found on disk')
        return None
    except pd.errors.EmptyDataError:
        print(f'{fpath} is empty')
        return None
    # Convert the timestamps after reading instead of via `read_csv(date_parser=...)`,
    # which is deprecated since pandas 2.0. Non-numeric values become NaN -> NaT
    # and are removed by the `dropna` below.
    epoch_secs = pd.to_numeric(comments['created_utc'], errors='coerce')
    comments_clean = comments\
        .assign(created_utc=pd.to_datetime(epoch_secs, unit='s', errors='coerce'))\
        .dropna()\
        .drop_duplicates(subset='id')
    return comments_clean

In [None]:
comments = read_comm_csv(fpath)

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99999 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   body         99999 non-null  string        
 1   created_utc  99999 non-null  datetime64[ns]
 2   id           99999 non-null  string        
 3   subreddit    99999 non-null  string        
dtypes: datetime64[ns](1), string(3)
memory usage: 3.8 MB


### Read multiple comment `csv` files

In [None]:
# export
def read_comm_csvs(fpaths):
    """Read several comments csv files and stack them into one `DataFrame`.

    Each path is read with `read_comm_csv`; files for which it returns `None`
    are skipped. The remaining frames are concatenated row-wise with a fresh
    index.

    Parameters
    ----------
    fpaths : iterable of csv file paths.

    Returns
    -------
    The concatenated `DataFrame`. When no file could be read (or `fpaths` is
    empty) an empty frame with the columns `id`, `created_utc`, `subreddit`
    and `body` is returned instead of letting `pd.concat` raise a `ValueError`.
    """
    frames = [read_comm_csv(fpath) for fpath in fpaths]
    # Drop the `None` placeholders explicitly rather than relying on
    # `pd.concat` to discard them silently.
    frames = [frame for frame in frames if frame is not None]
    if not frames:
        return pd.DataFrame(columns=['id', 'created_utc', 'subreddit', 'body'])
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
fpaths = get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS)

In [None]:
comments = read_comm_csvs(fpaths)

ValueError: Invalid file path or buffer object type: <class 'method'>

In [None]:
comments.value_counts('subreddit')

subreddit
politics    1400
dtype: int64

In [None]:
assert comments.shape == (1400, 4)

## Parse dates

In [None]:
# export
def parse_dates(comments):
    """Convert `created_utc` to datetimes, sort by it and drop unparseable rows.

    The frame passed in is modified in place and also returned. Values that
    cannot be parsed become `NaT` and their rows are removed.
    """
    date_col = 'created_utc'
    comments[date_col] = pd.to_datetime(comments[date_col], errors='coerce')
    # Both steps operate on the caller's frame rather than on a copy.
    comments.sort_values(by=date_col, inplace=True)
    comments.dropna(subset=[date_col], inplace=True)
    return comments

In [None]:
comments = parse_dates(comments)

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400 entries, 99 to 1300
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   body         1400 non-null   object        
 1   created_utc  1400 non-null   datetime64[ns]
 2   id           1400 non-null   object        
 3   subreddit    1400 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 54.7+ KB


# Export notebooks

In [None]:
# hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_processing.ipynb.
Converted 01_installation.ipynb.
Converted 02_read_data.ipynb.
Converted 03_clean_data.ipynb.
Converted 04_usage_intensity.ipynb.
Converted index.ipynb.
