In [None]:
# default_exp read_data
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Reading the data

> This notebook covers reading the Reddit data.

## Imports

In [None]:
# export
from glob import glob
import pandas as pd

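## `Google Cloud Storage` authentication

Reading with `source='remote'` relies on a `storage` client module, presumably from the `google-cloud-storage` package (`from google.cloud import storage`), and on valid credentials; neither is set up in this notebook yet.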

## Get file paths

### For lexeme data

In [None]:
#export
def get_fpaths_lex(LEX, CORPUS_DIR='data/', source='local', bucket_name='socemb'):
    """Return the paths of all comment files stored for the lexeme `LEX`.

    Parameters
    ----------
    LEX : str
        Lexeme whose files are looked up; used as the directory name
        (local) or as the blob-name prefix after `comments/` (remote).
    CORPUS_DIR : str
        Local corpus root. It is concatenated directly with `LEX`, so it
        must end with a path separator. Only used when `source='local'`.
    source : {'local', 'remote'}
        Where to look for the files.
    bucket_name : str
        Bucket to list. Only used when `source='remote'`.

    Returns
    -------
    list of str
        For `'local'`: the sorted `*.csv` paths found under
        `CORPUS_DIR + LEX` (empty if none match).
        For `'remote'`: one `gs://` URL per listed blob.

    Raises
    ------
    ValueError
        If `source` is neither `'local'` nor `'remote'`.
    """
    if source == 'remote':
        # NOTE(review): `storage` is not imported anywhere in the visible notebook;
        # presumably `from google.cloud import storage` is required -- confirm.
        client = storage.Client()
        blobs = client.list_blobs(bucket_name, prefix=f'comments/{LEX}')
        return [f'gs://{bucket_name}/{blob.name}' for blob in blobs]
    if source == 'local':
        # glob() returns matches in arbitrary, filesystem-dependent order;
        # sorting keeps the result reproducible across machines.
        return sorted(glob(f'{CORPUS_DIR}{LEX}/*.csv'))
    # Previously an unknown source fell through to `return fpaths` and failed
    # with an opaque UnboundLocalError.
    raise ValueError(f"source must be 'local' or 'remote', got {source!r}")

In [None]:
fpaths_lex = get_fpaths_lex('Anglo-Saxon', source='local')

In [None]:
fpaths_lex

['data/Anglo-Saxon/Anglo-Saxon_2006-02-04--2020-08-13.csv',
 'data/Anglo-Saxon/Anglo-Saxon_2019-12-31--2020-10-06.csv']

In [None]:
assert len(fpaths_lex) == 2

### For subreddit data

In [None]:
# Example parameters used by the cells below.
SUBREDDIT = 'politics'  # subreddit whose comment files are read
LIMIT = 100             # first component of the file name `{LIMIT}_{YEAR}.csv`
YEAR = 2007             # single year used in the one-file examples
YEARS = list(range(2007, 2021))  # 2007 through 2020 inclusive

In [None]:
fpath_comm_subr_yr = f'data/subreddit/{SUBREDDIT}/{LIMIT}_{YEAR}.csv'

In [None]:
# export
def get_fpath_subr_yr(SUBREDDIT, LIMIT, YEAR, CORPUS_DIR='data/'):
    """Build the path of the comments file for one subreddit and one year.

    Parameters
    ----------
    SUBREDDIT : str
        Subreddit name; used as the directory below `subreddit/`.
    LIMIT, YEAR
        Interpolated into the file name as `{LIMIT}_{YEAR}.csv`.
    CORPUS_DIR : str
        Corpus root, concatenated directly in front of `subreddit/`, so it
        must end with a path separator. Defaults to `'data/'`, which yields
        the same paths as before this parameter existed.

    Returns
    -------
    str
        `{CORPUS_DIR}subreddit/{SUBREDDIT}/{LIMIT}_{YEAR}.csv`. The path is
        only formatted; its existence is not checked.
    """
    return f'{CORPUS_DIR}subreddit/{SUBREDDIT}/{LIMIT}_{YEAR}.csv'

In [None]:
# Assert rather than display the comparison, so a wrong path fails the notebook run.
assert get_fpath_subr_yr(SUBREDDIT, LIMIT, YEAR) == 'data/subreddit/politics/100_2007.csv'

True

In [None]:
# export
def get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS):
    """Return one comments-file path per entry of `YEARS`, in the same order.

    Each path is produced by `get_fpath_subr_yr` for the given `SUBREDDIT`
    and `LIMIT`.
    """
    yearly_paths = []
    for yr in YEARS:
        yearly_paths.append(get_fpath_subr_yr(SUBREDDIT, LIMIT, yr))
    return yearly_paths

In [None]:
assert len(get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS)) == 14

## Read comments

### Read comments `csv` file

In [None]:
fpath = 'data/subreddit/politics/100_2007.csv'

In [None]:
# export
def read_comm_csv(fpath):
    """Load a comments `csv` file into a DataFrame.

    Only the columns `id`, `created_utc`, `subreddit` and `body` are read;
    any other column in the file is ignored.
    """
    wanted_cols = ['id', 'created_utc', 'subreddit', 'body']
    return pd.read_csv(fpath, usecols=wanted_cols)

In [None]:
comments = read_comm_csv(fpath)

In [None]:
assert len(comments) == 100

### Concatenate comment `csv` files

In [None]:
# export
def concat_comment_csvs(fpaths):
    """Read every file in `fpaths` and stack the rows into one DataFrame.

    Each file is loaded with `read_comm_csv`; the frames are concatenated
    row-wise in the order of `fpaths`, and the result gets a fresh
    0..n-1 index.
    """
    frames = [read_comm_csv(path) for path in fpaths]
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
fpaths = get_fpaths_subr_yrs(SUBREDDIT, LIMIT, YEARS)

In [None]:
comments = concat_comment_csvs(fpaths)

In [None]:
comments

Unnamed: 0,body,created_utc,id,subreddit
0,SUP BRO,1199055494,c02s16i,politics
1,You should have given the blog dude a piece of...,1199055485,c02s16h,politics
2,I DONT THINK THERE IS A STONG CLAN PRESENCE IN...,1199055470,c02s16d,politics
3,The problem is that the absurd is not preventi...,1199055459,c02s16a,politics
4,[deleted],1199055396,c02s162,politics
...,...,...,...,...
1395,"Actually, if you read the user agreement for P...",1609369108,ghjplk1,politics
1396,"The greedy don’t want to share, the rich have ...",1609369107,ghjplil,politics
1397,The reality is that it probably won't even tak...,1609369105,ghjplbz,politics
1398,He just invigorated her voters unwillingly,1609369104,ghjplaq,politics


In [None]:
assert comments.shape == (1400, 4)

## Parse dates

In [None]:
# export
def parse_dates(comments):
    """Convert `created_utc` to datetimes, sort by it and drop unparseable rows.

    Numeric `created_utc` values are interpreted as Unix epoch *seconds*
    (the form of the sample values shown in this notebook, e.g. 1199055494).
    Non-numeric values are parsed as date strings. Values that cannot be
    converted become `NaT` and their rows are removed.

    Parameters
    ----------
    comments : pandas.DataFrame
        Frame with a `created_utc` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: it is modified in place
        (column replaced, rows sorted ascending by `created_utc`, rows
        without a valid date dropped) and returned for convenience.
    """
    raw_stamps = comments['created_utc']
    if pd.api.types.is_numeric_dtype(raw_stamps):
        # Without `unit`, pandas reads integers as nanoseconds since the epoch,
        # which would place every comment in January 1970.
        parsed = pd.to_datetime(raw_stamps, unit='s', errors='coerce')
    else:
        # Strings and already-parsed datetimes take the default parsing path,
        # so calling the function a second time is harmless.
        parsed = pd.to_datetime(raw_stamps, errors='coerce')
    comments['created_utc'] = parsed
    comments.sort_values('created_utc', inplace=True)
    comments.dropna(subset=['created_utc'], inplace=True)
    return comments

In [None]:
comments = parse_dates(comments)

In [None]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400 entries, 99 to 1300
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   body         1400 non-null   object        
 1   created_utc  1400 non-null   datetime64[ns]
 2   id           1400 non-null   object        
 3   subreddit    1400 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 54.7+ KB


# Export notebooks

In [None]:
# hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_processing.ipynb.
Converted 01_installation.ipynb.
Converted 02_read_data.ipynb.
Converted 03_clean_data.ipynb.
Converted 04_usage_intensity.ipynb.
Converted index.ipynb.
