# Import relevant packages

In [35]:

import glob
import numpy as np
import pandas as pd
import re

from datetime import datetime

# Consolidate all excel files into a pandas df

In [2]:
# folder containing excel files
datapath = "./backup/"

# collect the paths of all .xlsx files in the folder into a list
allfiles = glob.glob(datapath + "*.xlsx")

# Read 'Sheet1' of every workbook first and concatenate once at the end.
# DataFrame.append copied the accumulated frame on every iteration and was
# removed in pandas 2.0; a single pd.concat gives the same stacked result.
frames = [pd.read_excel(f, sheet_name='Sheet1') for f in allfiles]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no workbook was found (what the original loop left behind).
df = pd.concat(frames) if frames else pd.DataFrame()

# Parse and clean the consolidated data

In [26]:
# Work on a deep copy so later in-place edits do not touch `df`.
df_clean = df.copy(deep=True)
# Number of rows in the working frame (shown as the cell output).
len(df_clean.index)

48267

In [27]:
# Replace the current index with a fresh default one (the old index is
# discarded), then remove the 'Unnamed: 0' and 'id' columns.
df_clean = df_clean.reset_index(drop=True)
df_clean = df_clean.drop(columns=['Unnamed: 0', 'id'])

In [28]:
# Missing complaints become empty strings so the text features below work.
df_clean['complaint'] = df_clean['complaint'].fillna('')


def _strip_list_wrapper(value):
    """Strip leading/trailing '[', ']' and "'" characters from string cells.

    Non-string cells (numbers, NaN, ...) are returned unchanged.
    """
    return value.strip("['']") if isinstance(value, str) else value


def _first_match(pattern, text):
    """Return the first match of `pattern` in `text`, or '' when none.

    Mirrors `re.findall(...)[0]`: with one capture group the group is
    returned, with no group the whole match. Non-string input (e.g. a
    missing `details` cell) yields '' instead of raising TypeError.
    """
    if not isinstance(text, str):
        return ''
    matches = pattern.findall(text)
    return matches[0] if matches else ''


# cleaning [''] inside relevant values
# The check is done per value rather than wrapping the whole column in a
# bare `except: pass`: previously a single non-string cell aborted the strip
# for the entire column and every other error was silently hidden.
for col in df_clean.columns:
    df_clean[col] = df_clean[col].apply(_strip_list_wrapper)

# feature engineering
# Raw-string, pre-compiled patterns (the non-raw '\d' / '\w' escapes trigger
# invalid-escape warnings on recent Python versions). Dict order fixes the
# order in which the new columns are added.
_DETAIL_PATTERNS = {
    'ID': re.compile(r'ID: (\d{9})'),
    'date': re.compile(r'\d\d/\d\d/\d\d'),
    'hour': re.compile(r'às (\d\dh\d\d)'),
    'city': re.compile(r'^(.*) - \w\w ID'),
    'state': re.compile(r'(\w\w) ID'),
}

# Each feature is the first match in 'details' ('' when absent). Taking the
# first match here replaces the former frame-wide applymap "unlisting" pass,
# which is deprecated in recent pandas and only existed to unwrap these lists.
for feature, pattern in _DETAIL_PATTERNS.items():
    df_clean[feature] = df_clean['details'].apply(
        lambda text, pattern=pattern: _first_match(pattern, text)
    )

# Complaint length in characters and in whitespace-separated words.
df_clean['len_complaint'] = df_clean['complaint'].apply(lambda x: len(str(x)))
df_clean['len_words'] = df_clean['complaint'].apply(lambda x: len(str(x).split()))

# '12h30' -> '12:30' (empty strings stay empty)
df_clean['hour'] = df_clean['hour'].apply(lambda value: value.replace('h', ':'))

# dropping non-relevant columns after parsing
df_clean.drop(columns='details', inplace=True)

# Replace typographic quotes (U+201C, U+201D, U+2018, U+2019) with '-'.
# NOTE(review): presumably done because these characters cannot be encoded
# as iso-8859-1 when the CSV is written - confirm.
_CURLY_QUOTES = re.compile(r'\u201c|\u201d|\u2018|\u2019')
df_clean['complaint'] = df_clean['complaint'].apply(
    lambda x: _CURLY_QUOTES.sub('-', x) if isinstance(x, str) else x
)


In [37]:
# Convert the extracted 'dd/mm/yy' strings into datetime values.
date_format = '%d/%m/%y'
df_clean['date'] = pd.to_datetime(df_clean['date'], format=date_format)

# Saving and closing file

In [29]:
# Move the current index into a regular column, then write the full dataset
# as a ';'-separated, iso-8859-1 encoded CSV file.
df_clean = df_clean.reset_index()
output_csv = './final/final_dataset.csv'
df_clean.to_csv(output_csv, sep=';', encoding='iso-8859-1')

In [31]:
# Remove the free-text 'complaint' column from the working frame.
df_clean = df_clean.drop(columns=['complaint'])

In [38]:
# Export the working frame to an Excel workbook.
# The `encoding` argument was dropped: DataFrame.to_excel deprecated it in
# pandas 1.5 and removed it in 2.0 (passing it raises TypeError), and the
# xlsx writers never used it, so the output is unchanged on older versions.
df_clean.to_excel('./final/final_dataset_wo_complaint.xlsx')