In [None]:
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

import dask.dataframe as dd
from dask.distributed import Client
import pandas as pd
import plotly.graph_objects as go

%load_ext nb_black

## Read smaller csv files using pandas

In [None]:
# Load a single CSV partition with pandas, restricted to the four columns used
# in this notebook. Each selected column is declared as `str`, and "date" is
# additionally listed in `parse_dates` so it is requested as a datetime column.
df = pd.read_csv(
    "../data_pipeline/data/csv_partition/news_csv_02",
    encoding="utf8",
    engine="c",
    usecols=["date", "title", "article", "section"],
    dtype=dict.fromkeys(["date", "title", "article", "section"], str),
    parse_dates=["date"],
    # Quoting rules: 0 is the value of csv.QUOTE_MINIMAL; fields are quoted
    # with '"' and an embedded quote is written as two consecutive quotes.
    quoting=0,
    quotechar='"',
    doublequote=True,
)
df

## Investigate date distribution

In [None]:
# Number of rows per calendar day in the CSV partition: `.dt.date` drops the
# time-of-day component before the values are counted.
date_list_csv_00 = df["date"].dt.date.value_counts()

In [None]:
# Bar chart of the per-day row counts from the CSV partition:
# one bar per day (x = day, y = count).
fig = go.Figure(
    data=[
        go.Bar(
            x=date_list_csv_00.index,
            y=date_list_csv_00.values,
            marker_color="rgb(26, 118, 255)",
        )
    ]
)
fig.update_layout(title_text="Number of Articles Per Day for First CSV")
fig.show()

## Read processed parquet file using dask

In [None]:
# Create a dask.distributed Client with default arguments before running the
# dask computations below.
# NOTE(review): with no scheduler address this presumably starts a local
# cluster -- confirm against the dask.distributed docs / deployment setup.
client = Client()

In [None]:
# Open the processed parquet dataset as a dask DataFrame.
# Note: this rebinds `df`, which until now referred to the pandas frame read
# from the CSV partition.
df = dd.read_parquet("../data_pipeline/data/news_v1.parquet")
df

In [None]:
# Rows per calendar day across the parquet dataset; `.compute()` executes the
# dask graph and returns the concrete result.
date_list = df["date"].dt.date.value_counts().compute()

In [None]:
# Summary statistics of the per-day counts held in `date_list`.
date_list.describe()

In [None]:
# Bar chart of the per-day row counts for the parquet dataset
# (x = day, y = count taken from `date_list`).
fig = go.Figure(
    data=[
        go.Bar(
            x=date_list.index,
            y=date_list.values,
            marker_color="rgb(26, 118, 255)",
        )
    ]
)
fig.update_layout(title_text="Number of Articles Per Day for Processed Parquet")
fig.show()

In [None]:
# Count how many rows carry each "section" value in the dask frame;
# `.compute()` evaluates the lazy result.
section_list = df["section"].value_counts().compute()

In [None]:
# Show the first 50 entries of the section counts.
section_list.head(50)

In [None]:
# Collect every row that has a missing value in at least one column.
# The row mask is built lazily; `.compute()` materializes the filtered rows.
rows_with_missing = df.isna().any(axis=1)
na_df = df[rows_with_missing].compute()

In [None]:
# Title of the first row that contains a missing value.
na_df["title"].iloc[0]

## Using pandas to read smaller processed parquet file

In [None]:
# Read only the first part file of the parquet dataset straight into pandas,
# using the pyarrow engine.
df_0 = pd.read_parquet(
    path="../data_pipeline/data/news_v1.parquet/part.0.parquet",
    engine="pyarrow",
)

In [None]:
# Number of rows in the first parquet part file.
len(df_0)

In [None]:
# Print the titles of the first (up to) ten rows whose section is "Financials".
# The filter is evaluated once instead of on every loop iteration, and
# `.head(10)` copes with fewer than ten matches instead of raising KeyError.
financials_titles = df_0.loc[df_0["section"] == "Financials", "title"]
for title in financials_titles.head(10):
    print(title)

In [None]:
# Display all rows of the first partition whose section is "Financials".
df_0.loc[df_0["section"] == "Financials"]

In [None]:
# Count rows per distinct "date" value in the first partition.
# `df_0` is a pandas DataFrame (loaded with pd.read_parquet), so the result is
# already concrete; pandas objects have no `.compute()` method and calling it
# raised AttributeError.
news_frequency_0 = df_0["date"].value_counts()

In [None]:
# Aggregate the per-timestamp counts into one value per calendar day.
# `news_frequency_0` maps each distinct timestamp to its number of rows, so the
# daily article total is the SUM of those counts; `.count()` would only report
# how many distinct timestamps occurred on each day.
news_day_frequency_0 = news_frequency_0.groupby(pd.Grouper(freq="D")).sum()

In [None]:
# Bar chart of the daily values for the first parquet partition
# (x = day, y = value from `news_day_frequency_0`).
fig = go.Figure(
    data=[
        go.Bar(
            x=news_day_frequency_0.index,
            y=news_day_frequency_0.values,
            marker_color="rgb(26, 118, 255)",
        )
    ]
)
fig.update_layout(title_text="Number of Articles Per Day for First Partition")
fig.show()

In [None]:
# `news_frequency` was never assigned in this notebook, so this cell raised
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): it is rebuilt here from the dask frame `df`, mirroring how
# `news_frequency_0` is derived from `df_0` -- confirm this is the intended
# source of the counts.
news_frequency = df["date"].value_counts().compute()

# Normalize the index to datetimes; values that cannot be parsed become NaT.
news_frequency.index = pd.to_datetime(news_frequency.index, errors="coerce")

In [None]:
# Aggregate the per-timestamp counts into one value per calendar day.
# The values of `news_frequency` are per-timestamp counts, so the daily total
# is their SUM; `.count()` would only report the number of distinct timestamps
# seen on each day rather than the number of articles.
news_day_frequency = news_frequency.groupby(pd.Grouper(freq="D")).sum()

In [None]:
# Bar chart of the daily values in `news_day_frequency`
# (x = day, y = value).
fig = go.Figure(
    data=[
        go.Bar(
            x=news_day_frequency.index,
            y=news_day_frequency.values,
            marker_color="rgb(26, 118, 255)",
        )
    ]
)
fig.update_layout(title_text="Number of Articles Per Day")
fig.show()

In [None]:
# Read the complete raw CSV into pandas in a single pass, keeping only the
# four columns of interest and requesting datetime parsing for "date".
# NOTE(review): with `usecols` given, `index_col=0` is presumably resolved
# against the selected columns rather than the file's first column -- confirm
# which column is meant to become the index.
df_full = pd.read_csv(
    "../data_pipeline/data/all-the-news-2-1.csv",
    usecols=["date", "title", "article", "section"],
    index_col=0,
    parse_dates=["date"],
)