In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [3]:
# Directory that holds the per-city CSV snapshots, relative to the notebook's
# working directory. Change this single constant if the data lives elsewhere.
DATA_DIR = Path('data')


def load_city_stops(city, data_dir=DATA_DIR):
    """Read one city's snapshot CSV into a DataFrame.

    Parameters
    ----------
    city : str
        City slug exactly as it appears in the file name (e.g. 'winston-salem').
    data_dir : path-like, default DATA_DIR
        Directory containing the ``nc_<city>_2020_04_01.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If the expected file does not exist. The message includes the resolved
        path and the current working directory, because a wrong working
        directory is the usual cause.
    """
    csv_path = Path(data_dir) / f'nc_{city}_2020_04_01.csv'
    if not csv_path.is_file():
        raise FileNotFoundError(
            f"Expected data file not found: {csv_path.resolve()} "
            f"(current working directory: {Path.cwd()})"
        )
    return pd.read_csv(csv_path)


charlotte = load_city_stops('charlotte')
durham = load_city_stops('durham')
fayetteville = load_city_stops('fayetteville')
greensboro = load_city_stops('greensboro')
raleigh = load_city_stops('raleigh')
winston_salem = load_city_stops('winston-salem')

FileNotFoundError: [Errno 2] No such file or directory: 'data/nc_charlotte_2020_04_01.csv'

In [None]:
# Stack the six city frames row-wise. ignore_index=True renumbers the rows
# 0..n-1 in the same step, so no separate reset_index(drop=True) is needed.
city_frames = [charlotte, durham, fayetteville, greensboro, raleigh, winston_salem]
combined = pd.concat(city_frames, ignore_index=True)
combined

## Split into train and test sets and save them as CSV files

In [None]:
# Fixed seed so that Restart & Run All reproduces the same split; without it
# every run shuffles differently and the saved CSV files change between runs.
RANDOM_STATE = 42

train, test = train_test_split(
    combined,
    test_size=0.2,
    # Stratify on department_name so each department keeps the same share of
    # rows in both the train and the test set.
    stratify=combined['department_name'],
    random_state=RANDOM_STATE,
)

In [None]:
# Destination files for the two splits (relative to the working directory).
train_filepath, test_filepath = Path('train.csv'), Path('test.csv')

# Write train first, then test; make sure each target's folder exists before
# writing, and leave the row index out of the CSV.
for split_frame, split_path in ((train, train_filepath), (test, test_filepath)):
    split_path.parent.mkdir(parents=True, exist_ok=True)
    split_frame.to_csv(split_path, index=False)

## Drop columns with a missing rate > 95%

In [None]:
def drop_high_missing_rate(df, threshold=95):
    """Return ``df`` without the columns that are almost entirely missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    threshold : float, default 95
        Percentage in the range 0-100. A column is dropped when its share of
        null values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns whose missing rate is at or
        below ``threshold``.
    """
    # Share of null cells per column, as a percentage of the row count.
    # This is all that is needed to decide; no summary-statistics table
    # (describe) has to be built for it.
    percent_missing = df.isnull().sum() * 100 / len(df)

    high_missing_rate = percent_missing[percent_missing > threshold].index
    return df.drop(columns=high_missing_rate)

# Remove the mostly-empty columns from the training split, then renumber rows.
processed_train = train.pipe(drop_high_missing_rate).reset_index(drop=True)
processed_train

## Preprocessing - combine date and time

In [None]:
def to_date_time(df):
    """Replace the separate ``date`` and ``time`` columns with one timestamp.

    The two string columns are concatenated and parsed with the format
    ``%Y-%m-%d%H:%M:%S`` so the result can be compared and sorted as a
    datetime.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``date`` and ``time``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` and ``time`` and with a ``date_time``
        column added.
    """
    date_time = pd.to_datetime(df['date'] + df['time'],
                               format='%Y-%m-%d%H:%M:%S')
    # assign() builds a new frame, so the caller's frame does not end up with
    # a stray date_time column as a side effect.
    return df.assign(date_time=date_time).drop(columns=['date', 'time'])

# Merge the date and time columns of the training frame into date_time.
processed_train = processed_train.pipe(to_date_time)
processed_train

## Fill missing values

In [None]:
def fillna(df, mean_cols=('subject_age',)):
    """Return a copy of ``df`` with missing values filled column by column.

    Columns listed in ``mean_cols`` are filled with their mean; every other
    column is filled with its most frequent value (the first mode).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified.
    mean_cols : str or iterable of str, default ('subject_age',)
        Names of the columns to fill with the column mean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. A column that has no non-missing
        value at all is left unchanged, because it has no mode to fill with.
    """
    if isinstance(mean_cols, str):
        # Guard against a bare string, where `in` would do substring matching.
        mean_cols = (mean_cols,)

    filled = df.copy()
    for col in filled.columns:
        column = filled[col]
        if not column.isna().any():
            continue  # nothing to fill

        if col in mean_cols:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                continue  # entirely missing: no mode exists
            fill_value = modes.iloc[0]

        # Assign the filled column back explicitly. Calling
        # fillna(inplace=True) on the result of df[col] acts on a temporary
        # object and does not reliably update the frame.
        filled[col] = column.fillna(fill_value)
    return filled
# Fill the remaining gaps in the training frame.
processed_train = processed_train.pipe(fillna)
processed_train

## Save processed_train as csv

In [None]:
# Target file for the cleaned training data (relative to the working directory).
train_filepath = Path('processed_train.csv')

# Make sure the containing folder exists, then write without the row index.
output_dir = train_filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)
processed_train.to_csv(train_filepath, index=False)

In [None]:
# Profiling report class from the pandas_profiling package.
from pandas_profiling import ProfileReport

# Build the report for the cleaned training frame; leaving it as the last
# expression of the cell makes the notebook display it.
profile = ProfileReport(processed_train)
profile