In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv("/Users/alejandropalacios/Desktop/Ironhack/Data Analytics Bootcamp/Advanced Data Analysis Techniques/Project/F1-Grand-Prix-Predictor/Data Manipulation/Modeling/final_df_complete.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13720 entries, 0 to 13719
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   season                             13720 non-null  int64  
 1   round                              13720 non-null  int64  
 2   circuit_id                         13720 non-null  object 
 3   weather_warm                       13720 non-null  bool   
 4   weather_cold                       13720 non-null  bool   
 5   weather_dry                        13720 non-null  bool   
 6   weather_wet                        13720 non-null  bool   
 7   weather_cloudy                     13720 non-null  bool   
 8   driver                             13720 non-null  object 
 9   nationality                        13720 non-null  object 
 10  constructor                        13720 non-null  object 
 11  grid                               13720 non-null  int

In [3]:
# The goal of this exercise will be, for every Driver, to modelate 0s for repeated Teams after every Season and 1s for different Teams. Thus we will get a binary idea of how many times a Driver switched Teams between Seasons.

# Step one, pulverize the problem into various subsets of things. Let's only get the Drivers, their Season, and with which Constructor they drove. 

drivers = dataset[['driver', 'season', 'constructor']].groupby('driver').apply(lambda a: a.drop('driver', axis=1)[:])
df = drivers.reset_index().drop(['level_1'], axis=1)

# Let's check Hamilton's career. Please note the fact that having duplicates does not mean we are going through a bad path, on the contrary, we need the duplicates to codify every row to a 0 or 1 value later.

df[df['driver'] == 'hamilton']


Unnamed: 0,driver,season,constructor
5264,hamilton,2007,mclaren
5265,hamilton,2007,mclaren
5266,hamilton,2007,mclaren
5267,hamilton,2007,mclaren
5268,hamilton,2007,mclaren
...,...,...,...
5500,hamilton,2019,mercedes
5501,hamilton,2019,mercedes
5502,hamilton,2019,mercedes
5503,hamilton,2019,mercedes


In [4]:
# now here's the trick, we will use a where clause to codify our data.

df['change_teams'] = np.where((df['driver'] == df['driver'].shift()) & (df['constructor'] == df['constructor'].shift()),0,1)


In [5]:
# Let's check Lewis career again. Sweet, we codified the thing already!

df[df['driver'] == 'hamilton']


Unnamed: 0,driver,season,constructor,change_teams
5264,hamilton,2007,mclaren,1
5265,hamilton,2007,mclaren,0
5266,hamilton,2007,mclaren,0
5267,hamilton,2007,mclaren,0
5268,hamilton,2007,mclaren,0
...,...,...,...,...
5500,hamilton,2019,mercedes,0
5501,hamilton,2019,mercedes,0
5502,hamilton,2019,mercedes,0
5503,hamilton,2019,mercedes,0


In [6]:
# For the next trick, we will design an id column to later merge the codification to the main dataframe. 

df['driver_season_id'] = df['driver'] + df['season'].astype(str)

In [7]:
# Same for the main dataset (so our left join has something to grasp from).

dataset['driver_season_id'] = dataset['driver'] + dataset['season'].astype(str)

In [11]:
# Now we merge everything together, and drop the columns which got duplicated. 

dataset = dataset.merge(df, left_on='driver_season_id', right_on='driver_season_id', how='left').drop(columns=['driver_season_id', 'driver_y', 'season_y', 'constructor_y'])

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215428 entries, 0 to 215427
Data columns (total 26 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   season_x                           215428 non-null  int64  
 1   round                              215428 non-null  int64  
 2   circuit_id                         215428 non-null  object 
 3   weather_warm                       215428 non-null  bool   
 4   weather_cold                       215428 non-null  bool   
 5   weather_dry                        215428 non-null  bool   
 6   weather_wet                        215428 non-null  bool   
 7   weather_cloudy                     215428 non-null  bool   
 8   driver_x                           215428 non-null  object 
 9   nationality                        215428 non-null  object 
 10  constructor_x                      215428 non-null  object 
 11  grid                               2154

In [15]:
# Let's finally re-name everything to its original state.

dataset.rename(columns={'season_x':'season', 'driver_x':'driver', 'constructor_x':'constructor'}, inplace=True)

In [17]:
# And yeah, here's the final stage of our First Step.

dataset[dataset['driver'] == 'hamilton']

Unnamed: 0,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,nationality,...,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age,wins_percentage,change_teams
124643,2007,1,albert_park,True,False,False,False,False,hamilton,British,...,0,0,0,0,0,0,0.683,22,0.0,1
124644,2007,1,albert_park,True,False,False,False,False,hamilton,British,...,0,0,0,0,0,0,0.683,22,0.0,0
124645,2007,1,albert_park,True,False,False,False,False,hamilton,British,...,0,0,0,0,0,0,0.683,22,0.0,0
124646,2007,1,albert_park,True,False,False,False,False,hamilton,British,...,0,0,0,0,0,0,0.683,22,0.0,0
124647,2007,1,albert_park,True,False,False,False,False,hamilton,British,...,0,0,0,0,0,0,0.683,22,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215052,2019,21,yas_marina,True,False,False,False,False,hamilton,British,...,381,10,1,701,14,1,0.000,34,0.2,0
215053,2019,21,yas_marina,True,False,False,False,False,hamilton,British,...,381,10,1,701,14,1,0.000,34,0.2,0
215054,2019,21,yas_marina,True,False,False,False,False,hamilton,British,...,381,10,1,701,14,1,0.000,34,0.2,0
215055,2019,21,yas_marina,True,False,False,False,False,hamilton,British,...,381,10,1,701,14,1,0.000,34,0.2,0
