# Setup & File Load

In [None]:
# import required packages
import os
import pandas as pd
import numpy
import mysql.connector
import transform


# raise the notebook display limits: up to 50 columns and 500 rows
pd.options.display.max_columns = 50
pd.options.display.max_rows = 500

In [None]:
# run this cell to load the route data from its csv export
# fields are separated by ';' and the literal \N marker is read as a missing value
df = pd.read_csv(
    './data/route_15A.csv',
    sep=";",
    na_values=['\\N'],
)

In [None]:
# report the size of the freshly loaded frame before anything is dropped
# df.shape is (number of rows, number of columns)
rows, cols = df.shape
print()
print("Before any data cleaning, the CSV file contains", rows, "rows and", cols, "columns.")
print()

In [None]:
# preview the first five rows of the loaded data
df.head(5)

# Initial Checks on the Data

- Duplicate rows and columns
- Null/empty features
- Assign features as categorical or continuous
- Constant features

## Check for Duplicate Rows & Columns

In [None]:
# Check for duplicate rows
# duplicated() flags every repeat of an earlier row; it is evaluated once and
# summed instead of being recomputed and filtered with `== True`
print()
print('Duplicate rows:', int(df.duplicated().sum()))
# Check for duplicate columns
# NOTE: this compares column labels only, not the contents of the columns
print('Duplicate columns:', int(df.columns.duplicated().sum()))

There are no duplicate rows or columns so nothing needs to be dropped here.

## Check for Null/Empty Features

In [None]:
# summarise every described column, one feature per row
# a 'count' of zero means the column holds no values at all
df.describe().transpose()

Features with a count of zero can be dropped as they contain no information.

In [None]:
# remove the features that were found to be entirely empty
df = df.drop(
    [
        'tenderlot',
        'suppressed_trip',
        'justificationid_trip',
        'passengers',
        'passengersin',
        'passengersout',
        'distance_leavetimes',
        'note_leavetimes',
        'note_vehicle',
    ],
    axis='columns',
)

## Assign Features as Continuous or Categorical

First check the data types of all columns after the file import.

In [None]:
# show the data type of every column
df.dtypes

Assign categorical and continuous features, and update the type of all categorical features to 'category'.

In [None]:
# Names of the columns containing categorical data.
# Built directly as an Index so that no copy of the underlying columns is made
# just to read the labels back.
categorical_columns = pd.Index([
    'datasource', 'dayofservice', 'tripid', 'lineid', 'routeid', 'direction', 'basin',
    'lastupdate_trip', 'note_trip', 'progrnumber', 'stoppointid',
    'suppressed_leavetimes', 'lastupdate_leavetimes',
])

# Convert data type to 'category' for these columns
# (a name that is not present in df raises KeyError here)
for column in categorical_columns:
    df[column] = df[column].astype('category')

In [None]:
# Collect the names of the continuous features:
# every column whose dtype is float64 or int64
continuous_columns = df.select_dtypes(include=['float64', 'int64']).columns

## Check for Constant Categorical Features

In [None]:
# summary statistics for the categorical columns, one feature per row
df[categorical_columns].describe().transpose()

In [None]:
# remove the two categorical features identified as constant
df = df.drop(['datasource', 'basin'], axis='columns')

## Check for Constant Continuous Features

In [None]:
# summary statistics for the continuous columns, one feature per row
df[continuous_columns].describe().transpose()

There are no constant continuous features so nothing needs to be dropped.

# Further Analysis of Features

- Features that don't provide much information
- Features that we won't be able to provide information on to the model

In [None]:
# remove the features that will not be used
df = df.drop(
    [
        'lastupdate_trip',
        'note_trip',
        'suppressed_leavetimes',
        'justificationid_leavetimes',
        'lastupdate_leavetimes',
        'vehicleid',
        'distance_vehicle',
        'minutes_vehicle',
    ],
    axis='columns',
)

In [None]:
# preview the first five rows after the unused features have been dropped
df.head(5)

In [None]:
# show the data type of every remaining column
df.dtypes

# Initial Checks for Missing Data

## Categorical Features

In [None]:
# Names of the remaining columns containing categorical data.
# Built directly as an Index so that no copy of the underlying columns is made
# just to read the labels back.
categorical_columns = pd.Index([
    'dayofservice', 'tripid', 'lineid', 'routeid', 'direction', 'progrnumber', 'stoppointid',
])

In [None]:
# summary statistics for the categorical columns; 'count' shows how many values are present
df[categorical_columns].describe().transpose()

There is a full count for all categorical features.

## Continuous Features

In [None]:
# Collect the names of the continuous features again now that columns have been dropped:
# every column whose dtype is float64 or int64
continuous_columns = df.select_dtypes(include=['float64', 'int64']).columns

In [None]:
# summary statistics for the continuous columns; 'count' shows how many values are present
df[continuous_columns].describe().transpose()

There are some rows missing data for **actualtime_arr_trip** and **actualtime_dep_trip**. This will be reviewed if these features are used in the future; currently they are not carried across when the data is transformed.

# Transform the Data

In [None]:
# pass the cleaned frame through the project's transform module
# NOTE(review): the output schema is defined by transform.transform_data, not in this notebook
df_transformed = transform.transform_data(df)

In [None]:
# display the transformed frame
df_transformed

# Check for Missing Data

First re-assign the transformed data as continuous or categorical.

In [None]:
# show the data type of every column in the transformed frame
df_transformed.dtypes

In [None]:
# Names of the transformed columns containing categorical data.
# Built directly as an Index so that no copy of the underlying columns is made
# just to read the labels back.
categorical_columns = pd.Index([
    'dayofservice', 'tripid', 'lineid', 'routeid', 'direction',
    'progrnumber_first', 'stoppointid_first',
    'progrnumber_next', 'stoppointid_next',
])

# Convert data type to 'category' for these columns
# (a name that is not present in df_transformed raises KeyError here)
for column in categorical_columns:
    df_transformed[column] = df_transformed[column].astype('category')

In [None]:
# Collect the names of the continuous features in the transformed frame:
# every column whose dtype is float64 or int64
continuous_columns = df_transformed.select_dtypes(include=['float64', 'int64']).columns

Then check for missing data.

In [None]:
# summary statistics for the transformed categorical columns, one feature per row
df_transformed[categorical_columns].describe().transpose()

In [None]:
# summary statistics for the transformed continuous columns, one feature per row
df_transformed[continuous_columns].describe().transpose()

## Drop Rows with Missing Data

In [None]:
# keep only the rows that have a value for stoppointid_first
df_transformed = df_transformed.loc[df_transformed['stoppointid_first'].notna()]

In [None]:
# keep only the rows that have a value for stoppointid_next
# NOTE(review): the data quality plan also lists actualtime_arr_stop_first/_next as
# "drop affected rows", but no filter is applied on them here -- confirm that their
# missing rows coincide with the stoppointid ones
df_transformed = df_transformed.loc[df_transformed['stoppointid_next'].notna()]

In [None]:
# re-check the categorical columns now that incomplete rows have been removed
df_transformed[categorical_columns].describe().transpose()

In [None]:
# re-check the continuous columns now that incomplete rows have been removed
df_transformed[continuous_columns].describe().transpose()

# Data Quality Plan - Before Data Transformation

| Feature | Data Quality Issue | Handling Strategy |
|-------------------------|----------------------|------------------------------|
| tenderlot       | All rows are null | Drop feature |
| suppressed_trip | All rows are null | Drop feature |
| justificationid_trip | All rows are null | Drop feature |
| passengers | All rows are null | Drop feature | 
| passengersin | All rows are null | Drop feature |
| passengersout | All rows are null | Drop feature |
| distance_leavetimes | All rows are null | Drop feature |
| note_leavetimes | All rows are null | Drop feature |
| note_vehicle | All rows are null | Drop feature |
| datasource | Constant feature | Drop feature |
| lineid | Constant feature | This is constant because we just have data for one route loaded. At some point we may process more than one route together so will keep feature for now. May not be needed to train the model. |
| basin | Constant feature | Drop feature |
| lastupdate_trip | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| note_trip | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| suppressed_leavetimes | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| justificationid_leavetimes | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| lastupdate_leavetimes | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| vehicleid | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| distance_vehicle | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| minutes_vehicle | Cannot be used to train model as we won't be able to provide this information | Drop feature |
| actualtime_arr_trip | Missing values < 1% | Ignore for now as this feature is not brought across when data is transformed. |
| actualtime_dep_trip | Missing values < 3% | Ignore for now as this feature is not brought across when data is transformed. |

# Data Quality Plan - After Data Transformation

| Feature | Data Quality Issue | Handling Strategy |
|-------------------------|----------------------|------------------------------|
| stoppointid_first | Missing values ~ 1% | Drop affected rows |
| actualtime_arr_stop_first | Missing values ~ 1%| Drop affected rows |
| stoppointid_next | Missing values ~ 1% | Drop affected rows |
| actualtime_arr_stop_next | Missing values ~ 1%| Drop affected rows |

# Tests for Transforming the Data

In [None]:
# test frame 1: one unbroken run of rows, renumbered from zero
# .loc slices by label, so both end labels (5 and 100) are included
df_test1 = df.loc[5:100].reset_index(drop=True)
df_test1

In [None]:
# run the transformation on test frame 1 and display the result for inspection
df_transformed1 = transform.transform_data(df_test1)
df_transformed1

In [None]:
# test frame 2: positions 0-34 followed by 42-99, i.e. positions 35-41 are left out
pieces = [df.iloc[:35], df.iloc[42:100]]
# ignore_index renumbers the joined rows from zero
df_test2 = pd.concat(pieces, ignore_index=True)
df_test2

In [None]:
# run the transformation on test frame 2 and display the result for inspection
df_transformed2 = transform.transform_data(df_test2)
df_transformed2

In [None]:
# test frame 3: positions 0-4 followed by 10-99, i.e. positions 5-9 are left out
pieces = [df.iloc[:5], df.iloc[10:100]]
# ignore_index renumbers the joined rows from zero
df_test3 = pd.concat(pieces, ignore_index=True)
df_test3

In [None]:
# run the transformation on test frame 3 and display the result for inspection
df_transformed3 = transform.transform_data(df_test3)
df_transformed3

In [None]:
# test frame 4: positions 0-4, 8-9 and 14-49, i.e. positions 5-7 and 10-13 are left out
pieces = [df.iloc[:5], df.iloc[8:10], df.iloc[14:50]]
# ignore_index renumbers the joined rows from zero
df_test4 = pd.concat(pieces, ignore_index=True)
df_test4

In [None]:
# run the transformation on test frame 4 and display the result for inspection
df_transformed4 = transform.transform_data(df_test4)
df_transformed4