# Feature Engineering Script

Input variables: 48 + 1 (id)

In [1]:
# importing required packages
import pandas as pd
import pandas_profiling
import numpy as np
import os
import featuretools as ft

# To install packages, run on cmd:
# pip3 install pandas-profiling
# pip3 install featuretools

In [2]:
# City under analysis; its name is used to build the input data path.
# To analyse another city, activate one of the alternatives below instead.
city = "Recife"
# city = "CG"
# city = "Curitiba"

In [3]:
# Directory containing the integrated data of the selected city, resolved
# relative to the current working directory. The trailing "/" is kept because
# file names are appended to this string directly.
integrated_data_path = "{}/../data/output/{}/".format(os.getcwd(), city)

In [4]:
# Load every per-day integrated output into a single DataFrame.
#
# Layout read from `integrated_data_path`: one "output_*" directory per day,
# each containing one or more "part-*" CSV files.

output_files = []
days_of_analysis = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame (and any index later derived from it) reproducible.
for dir_name in sorted(os.listdir(integrated_data_path)):
    dir_path = os.path.join(integrated_data_path, dir_name)

    # Only "output_*" directories hold a day of data; skip anything else
    # (including stray files that merely share the prefix).
    if not (dir_name.startswith("output_") and os.path.isdir(dir_path)):
        continue
    days_of_analysis += 1

    for file_name in sorted(os.listdir(dir_path)):
        if not file_name.startswith("part-"):
            continue
        file_path = os.path.join(dir_path, file_name)

        # low_memory=False makes pandas infer each column dtype from the whole
        # file instead of chunk by chunk, avoiding mixed-type columns.
        df = pd.read_csv(file_path, index_col=None, header=0, low_memory=False)
        output_files.append(df)

print('Days of analysis: ', days_of_analysis)

# Fail with an explicit message instead of the generic pd.concat error.
if not output_files:
    raise ValueError(
        "No 'part-*' files found under 'output_*' directories in " + integrated_data_path
    )

# Join all output files into one frame with a fresh 0..n-1 index.
all_integrated_data = pd.concat(output_files, axis=0, ignore_index=True)

all_integrated_data.head()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Days of analysis:  12


Unnamed: 0,route,tripNum,shapeId,routeFrequency,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,...,jamDelay,jamLength,jamLevel,jamSeverity,jamSpeedKM,jamDistanceToClosestShapePoint,headway,busBunching,nextBusCode,GPShour
0,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50661,,...,,,,,,,26,False,50671,4
1,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50671,409090.0,...,,,,,,,25,False,50669,5
2,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50669,406112.0,...,,,,,,,9,False,50662,5
3,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50662,,...,,,,,,,7,False,50663,5
4,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50663,,...,,,,,,,11,False,50760,5


The following variables were loaded as `object` type and should be converted to their original types so that they can be used in the correlation matrix.

In [5]:
# Inspect the dtype pandas inferred for each column when reading the CSV files.
all_integrated_data.dtypes

route                                 int64
tripNum                               int64
shapeId                              object
routeFrequency                       object
shapeSequence                         int64
shapeLat                            float64
shapeLon                            float64
distanceTraveledShape               float64
busCode                               int64
gpsPointId                          float64
gpsLat                              float64
gpsLon                              float64
distanceToShapePoint                float64
gps_datetime                         object
stopPointId                           int64
problem                              object
precipitation                       float64
precipitationTime                    object
alertDateTime                        object
alertSubtype                         object
alertType                            object
alertRoadType                       float64
alertConfidence                 

In [6]:
# Restore the intended dtype of the columns that were not loaded with it.

# Text columns; after astype(str) a missing value becomes the string "nan".
text_columns = ['shapeId', 'routeFrequency', 'problem']
# Text columns whose missing values are shown as "-" instead of "nan".
text_columns_with_placeholder = ['alertSubtype', 'alertType', 'jamBlockType']
datetime_columns = [
    'gps_datetime',
    'precipitationTime',
    'alertDateTime',
    'jamUpdateDateTime',
    'jamExpirationDateTime',
]
boolean_columns = ['alertIsJamUnifiedAlert', 'alertInScale']

for column in text_columns + text_columns_with_placeholder:
    all_integrated_data[column] = all_integrated_data[column].astype(str)

for column in text_columns_with_placeholder:
    # Assign the result back rather than calling replace(..., inplace=True) on
    # the selected column: the in-place form acts on an intermediate object
    # and is not guaranteed to update the DataFrame.
    all_integrated_data[column] = all_integrated_data[column].replace("nan", "-")

for column in datetime_columns:
    all_integrated_data[column] = all_integrated_data[column].astype('datetime64[ns]')

for column in boolean_columns:
    values = all_integrated_data[column]
    # NaN is truthy, so a bare astype(bool) would turn every missing value
    # into True. Map missing values to False before casting.
    all_integrated_data[column] = values.where(values.notna(), False).astype(bool)

all_integrated_data.dtypes

route                                        int64
tripNum                                      int64
shapeId                                     object
routeFrequency                              object
shapeSequence                                int64
shapeLat                                   float64
shapeLon                                   float64
distanceTraveledShape                      float64
busCode                                      int64
gpsPointId                                 float64
gpsLat                                     float64
gpsLon                                     float64
distanceToShapePoint                       float64
gps_datetime                        datetime64[ns]
stopPointId                                  int64
problem                                     object
precipitation                              float64
precipitationTime                   datetime64[ns]
alertDateTime                       datetime64[ns]
alertSubtype                   

In [None]:
# Build the descriptive-statistics report for the whole dataset, with
# correlation checking enabled at a 0.5 threshold.
profile = pandas_profiling.ProfileReport(
    all_integrated_data,
    check_correlation=True,
    correlation_threshold=0.5,
)

In [None]:
# Write the profiling report as HTML next to the integrated data.
# `output_path` is only assigned in a later cell (and holds a CSV file path),
# so the destination is built from `integrated_data_path` instead.
profile.to_file(outputfile=os.path.join(integrated_data_path, "output_profile.html"))

In [None]:
# Drop jamExpirationDateTime because it has no values.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
all_integrated_data = all_integrated_data.drop('jamExpirationDateTime', axis=1, errors='ignore')

* Maybe the correlation was not measured because there are too many variables.

### 2. Creating new features


#### 2.1 With Feature Tools

In [8]:
# Build a featuretools entity set holding a single entity backed by the
# integrated data.
es = ft.EntitySet(id="all_data")

# make_index=True asks featuretools to create the "id" index column;
# gps_datetime is registered as the time index.
entities = es.entity_from_dataframe(
    entity_id="id",
    dataframe=all_integrated_data,
    index="id",
    make_index=True,
    time_index='gps_datetime',
)
entities

Entityset: all_data
  Entities:
    id [Rows: 1643336, Columns: 49]
  Relationships:
    No relationships

In [9]:
# List the entities of the entity set together with the variable type
# featuretools assigned to each column.
entities.entities

[Entity: id
   Variables:
     id (dtype: index)
     route (dtype: numeric)
     tripNum (dtype: numeric)
     shapeId (dtype: categorical)
     routeFrequency (dtype: categorical)
     shapeSequence (dtype: numeric)
     shapeLat (dtype: numeric)
     shapeLon (dtype: numeric)
     distanceTraveledShape (dtype: numeric)
     busCode (dtype: numeric)
     gpsPointId (dtype: numeric)
     gpsLat (dtype: numeric)
     gpsLon (dtype: numeric)
     distanceToShapePoint (dtype: numeric)
     gps_datetime (dtype: datetime_time_index)
     stopPointId (dtype: numeric)
     problem (dtype: categorical)
     precipitation (dtype: numeric)
     precipitationTime (dtype: datetime)
     alertDateTime (dtype: datetime)
     alertSubtype (dtype: categorical)
     alertType (dtype: categorical)
     alertRoadType (dtype: numeric)
     alertConfidence (dtype: numeric)
     alertNComments (dtype: numeric)
     alertNImages (dtype: numeric)
     alertNThumbsUp (dtype: numeric)
     alertReliability (dt

**For the target variable (bus data per bus stop), we can generate the following variables:**

In [10]:
# Run Deep Feature Synthesis on the "id" entity; it returns the generated
# feature matrix and the list of feature definitions.
feature_matrix, features_defs = ft.dfs(
    entityset=entities,
    target_entity="id",
)

In [12]:
# Persist the generated feature matrix as CSV (index not written) in the
# same directory as the integrated data.
output_path = os.path.join(integrated_data_path, "new_feats_data.csv")
feature_matrix.to_csv(output_path, index=False)

# Preview the first five rows.
feature_matrix.head()

Unnamed: 0_level_0,route,tripNum,shapeId,routeFrequency,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,...,MONTH(gps_datetime),MONTH(precipitationTime),MONTH(alertDateTime),MONTH(jamUpdateDateTime),MONTH(jamExpirationDateTime),WEEKDAY(gps_datetime),WEEKDAY(precipitationTime),WEEKDAY(alertDateTime),WEEKDAY(jamUpdateDateTime),WEEKDAY(jamExpirationDateTime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50661,,...,12,12,12.0,,,2,2,2.0,,
1,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50671,409090.0,...,12,12,12.0,,,2,2,2.0,,
2,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50669,406112.0,...,12,12,12.0,,,2,2,2.0,,
3,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50662,,...,12,12,12.0,,,2,2,2.0,,
4,96,1,Route_219,low_frequency,22857,-8.046589,-34.878125,3606.0,50663,,...,12,12,12.0,,,2,2,2.0,,


In [11]:
# Show the feature definitions returned by ft.dfs alongside the feature matrix.
features_defs

[<Feature: route>,
 <Feature: tripNum>,
 <Feature: shapeId>,
 <Feature: routeFrequency>,
 <Feature: shapeSequence>,
 <Feature: shapeLat>,
 <Feature: shapeLon>,
 <Feature: distanceTraveledShape>,
 <Feature: busCode>,
 <Feature: gpsPointId>,
 <Feature: gpsLat>,
 <Feature: gpsLon>,
 <Feature: distanceToShapePoint>,
 <Feature: stopPointId>,
 <Feature: problem>,
 <Feature: precipitation>,
 <Feature: alertSubtype>,
 <Feature: alertType>,
 <Feature: alertRoadType>,
 <Feature: alertConfidence>,
 <Feature: alertNComments>,
 <Feature: alertNImages>,
 <Feature: alertNThumbsUp>,
 <Feature: alertReliability>,
 <Feature: alertReportMood>,
 <Feature: alertReportRating>,
 <Feature: alertSpeed>,
 <Feature: alertLatitude>,
 <Feature: alertLongitude>,
 <Feature: alertDistanceToClosestShapePoint>,
 <Feature: alertIsJamUnifiedAlert>,
 <Feature: alertInScale>,
 <Feature: jamBlockType>,
 <Feature: jamDelay>,
 <Feature: jamLength>,
 <Feature: jamLevel>,
 <Feature: jamSeverity>,
 <Feature: jamSpeedKM>,
 <Featu

About feature types:
* It works based on each variable's data type, but generates only simple features (mean, sum, count, mode, ...).