In [None]:
# IMPORTING MODULES
import os
import pandas as pd
# import plotly.express as px
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objects as go


## Setting up path variables for easy, consistent access throughout the notebook

In [None]:
# PATH CONFIGURATION
# Every CSV location below is derived from this single root folder.
JUDICIAL_DATA_CSV_FOLDER_PATH = "/mnt/e/judicial_data_csv/"


def _data_path(relative_path):
    """Return ``relative_path`` joined onto ``JUDICIAL_DATA_CSV_FOLDER_PATH``."""
    return os.path.join(JUDICIAL_DATA_CSV_FOLDER_PATH, relative_path)


ACTS_SECTIONS_CSV = _data_path("acts_sections.csv")
CASES_CSV_FOLDER = _data_path("cases/cases/")  # folder, not a single file
JUDGES_CSV = _data_path("judges_clean/judges_clean.csv")
KEYS_CSV = _data_path("keys.csv")



# To test if the file is being read properly #

# for dirname, _, filenames in os.walk(JUDICIAL_DATA_CSV_FOLDER_PATH):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))



### Analyzing cases csv data 

- We will read the csv file first
- We will later change this to a for-loop so we can loop through every csv file

#### Bibliography
- https://stackoverflow.com/questions/50089903/convert-column-to-timestamp-pandas-dataframe
- https://www.geeksforgeeks.org/get-minimum-values-in-rows-or-columns-with-their-index-position-in-pandas-dataframe/
- https://stackoverflow.com/questions/19324453/add-missing-dates-to-pandas-dataframe
- https://www.w3schools.com/python/pandas/ref_df_min.asp#:~:text=The%20min()%20method%20returns,minimum%20value%20for%20each%20row.
- https://www.geeksforgeeks.org/how-to-sort-a-pandas-dataframe-by-date/
- https://www.datasciencemadesimple.com/create-frequency-table-of-column-in-pandas-python-2/
- https://datagy.io/pandas-value-counts/
- https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
- https://stackoverflow.com/questions/32888124/pandas-out-of-bounds-nanosecond-timestamp-after-offset-rollforward-plus-adding-a
-  https://stackoverflow.com/questions/19324453/add-missing-dates-to-pandas-dataframe
-  https://stackoverflow.com/questions/26097916/convert-pandas-series-to-dataframe
-  https://stackoverflow.com/questions/61233041/module-not-found-error-no-module-named-chart-studio
-  https://plotly.com/python/ipython-notebook-tutorial/

In [None]:
# Load the 2014 cases file from the cases folder and preview its first rows.
cases_2014_csv = os.path.join(CASES_CSV_FOLDER, "cases_2014.csv")
df = pd.read_csv(cases_2014_csv)
df.head()


### Plotting the number of pending cases 

#### How was the data preprocessed and analyzed?
- This was achieved by using the `date_of_filing` and `date_of_decision` columns.
- A case is considered pending on a given date if its `date_of_filing` has passed but its `date_of_decision` has not yet been reached.
- The columns `date_of_filing` and `date_of_decision` were cleaned using the `pandas.to_datetime()` function with the parameter `errors='coerce'`, which replaces entries that are not in a valid timestamp format with `NaT`.
- The previous step ensures that neither column retains unparseable entries (they become `NaT`).
- A frequency map was created using `DataFrame[column].value_counts()`.
- The frequency of dates on which nothing occurred was filled with zeros using the `pandas.Series.reindex(range_of_dates, fill_value=0)` method.
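- The daily number of decided cases was subtracted from the daily number of filed cases, and the result was accumulated with `cumsum()` to obtain the number of pending cases over time.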

In [None]:
# Parse both date columns on `df`; entries that cannot be parsed become NaT
# because of errors='coerce'.
df['date_of_filing'] = pd.to_datetime(df['date_of_filing'], errors='coerce')
df['date_of_decision'] = pd.to_datetime(df['date_of_decision'], errors='coerce')


def _daily_counts(dates, calendar):
    """Count how many entries of ``dates`` fall on each day of ``calendar``.

    Parameters
    ----------
    dates : pandas.Series
        Datetime values. ``NaT`` entries are ignored, since ``value_counts``
        drops missing values by default.
    calendar : pandas.DatetimeIndex
        The days to report on. Days that never occur in ``dates`` get 0.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by ``calendar``.
    """
    counts = dates.value_counts()
    # Explicit DatetimeIndex so the reindex below aligns on timestamps.
    counts.index = pd.DatetimeIndex(counts.index)
    return counts.reindex(calendar, fill_value=0)


# Earliest and latest date found in either column (NaT values are skipped).
min_date = df[['date_of_filing', 'date_of_decision']].min().min()
max_date = df[['date_of_filing', 'date_of_decision']].max().max()

# Fail with a clear message instead of an opaque date_range error when
# neither column contains a single parseable date.
if pd.isna(min_date) or pd.isna(max_date):
    raise ValueError(
        "No valid dates found in 'date_of_filing' or 'date_of_decision'; "
        "cannot build the date range."
    )

# One entry per calendar day between the two extremes (inclusive).
range_of_dates = pd.date_range(min_date, max_date)

freq_filed_cases = _daily_counts(df['date_of_filing'], range_of_dates)
freq_solved_cases = _daily_counts(df['date_of_decision'], range_of_dates)

# Net change per day: cases filed minus cases decided on that day.
# NOTE(review): a row with a decision date but no parseable filing date is
# subtracted here without ever having been added — confirm whether such rows
# should be excluded.
freq_pending_cases = freq_filed_cases.subtract(freq_solved_cases)

df_pending_cases = pd.DataFrame({
    'date': freq_pending_cases.index,
    'num_pending_cases': freq_pending_cases.values,
})
# Running total of the daily net change = cases pending as of each date.
df_pending_cases['cum_pending_cases'] = df_pending_cases['num_pending_cases'].cumsum()

# Show a short preview rather than printing an entire per-day Series.
df_pending_cases.head()


# table = ff.create_table(df_pending_cases)
# py.iplot(table, filename='Pending Cases during 2010')
# df_pending_cases.plot(
#         x = 'date', 
#         y = 'cum_pending_cases', 
#         xlim = (min_date, max_date),
#         title = 'Number of pending cases over time'
# )
