In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import patsy
import statsmodels.api as sm

# Tsukuba Modelling

### Loading the Tsukuba Dataset

In [None]:
# Read the raw sheet, keeping only its first 26 columns.
tsukuba = pd.read_csv("data/tsukuba_ac.csv").iloc[:, :26]

# Discard rows whose power-to-weight cell holds the spreadsheet error marker
# '#DIV/0!'. The explicit .copy() makes the filtered result an independent
# frame before columns are assigned on it below.
tsukuba = tsukuba.loc[tsukuba['P/W (hp/t)'] != '#DIV/0!'].copy()

# Cast both columns to float64 in one vectorised step each.
tsukuba['P/W (hp/t)'] = tsukuba['P/W (hp/t)'].astype(np.float64)
tsukuba['Year'] = tsukuba['Year'].astype(np.float64)

# Remove rows in which every column is missing (rebinding rather than
# mutating in place keeps the lineage of `tsukuba` explicit).
tsukuba = tsukuba.dropna(axis=0, how='all')


def lt_in_seconds(time):
    """Convert a ``minutes:seconds`` lap-time string into seconds.

    Surrounding whitespace is ignored. The string must contain exactly one
    ':' separator; anything else fails on unpacking with ``ValueError``.

    Returns the total as ``np.float64``.
    """
    pieces = time.strip().split(':')
    mins, secs = pieces
    return np.float64(mins) * 60 + np.float64(secs)

def msrp_float(msrp):
    """Parse one MSRP cell into a ``np.float64``.

    Parameters
    ----------
    msrp : str, number or missing value
        A price such as ``'$12,345'`` or ``'>$50,000'``; the characters
        ``$``, ``,`` and ``>`` are stripped before conversion. Values that
        are already numeric are converted as they are.

    Returns
    -------
    np.float64
        The parsed price, or NaN when the cell is missing.

    Raises
    ------
    ValueError
        If a string still is not a valid number after stripping.
    """
    if pd.isna(msrp):
        # Explicit NaN instead of relying on np.float64(None) yielding NaN.
        return np.float64(np.nan)
    if isinstance(msrp, str):
        # Only strings carry currency formatting; numeric cells have no
        # .replace and would otherwise raise AttributeError.
        msrp = msrp.replace('$', '').replace(',', '').replace('>', '')
    return np.float64(msrp)

# Parse the price column into floats (NaN where the cell is missing).
tsukuba['MSRP (2022 USD)'] = tsukuba['MSRP (2022 USD)'].apply(msrp_float)

# Missing lap times are filled with a zero-length lap so that every cell can
# be parsed; after this step an untimed car reads 0.0 seconds, not NaN.
z = '0:00.000'
for lap_col in ('Lap Time I', 'Lap Time II', 'Lap Time III'):
    tsukuba[lap_col] = tsukuba[lap_col].fillna(z).apply(lt_in_seconds)

# Cars with a recorded first lap time (the 0.0 placeholder is excluded).
tsukuba_timed = tsukuba.loc[tsukuba['Lap Time I'] > 0]

# Rows whose Source is exactly 'irl'; a missing Source counts as not-irl.
is_irl = tsukuba['Source'] == 'irl'

tsukuba_ac = tsukuba.loc[~is_irl]

tsukuba_all_irl = tsukuba.loc[is_irl].drop(columns=['Author'])

# Real-life rows with compID == 0.
tsukuba_irl = (
    tsukuba.loc[is_irl & (tsukuba["compID"] == 0)]
    .drop(columns=['Author'])
)

# Rows with a positive compID, from any source.
tsukuba_test_set = tsukuba.loc[tsukuba["compID"] > 0]

# The mask is built from tsukuba_test_set itself so that it has exactly the
# index of the frame being filtered, instead of relying on alignment with a
# mask computed on the larger `tsukuba` frame.
tsukuba_test_set_irl = tsukuba_test_set.loc[tsukuba_test_set['Source'] == 'irl']

In [None]:
txt = " ({numcar} cars)"

# (heading, frame) pairs, in the order they are reported.
dataset_views = (
    ("All Tsukuba Data", tsukuba),
    ("\nTimed  Data Only", tsukuba_timed),
    ("\nAssetto Corsa Data", tsukuba_ac),
    ("\nAll Real Life Data", tsukuba_all_irl),
    ("\nReal Life Only Data", tsukuba_irl),
    ("\nTest Set (Direct Comparisons)", tsukuba_test_set),
    ("\nTest Set (IRL)", tsukuba_test_set_irl),
)

print("Datasets:\n")
for heading, frame in dataset_views:
    # Heading with the row count, followed by a preview of the first rows.
    print(heading + txt.format(numcar=frame.shape[0]))
    display(frame.head())


In [None]:
# tsukuba_ac holds the rows with Source != 'irl' and tsukuba_all_irl the rows
# with Source == 'irl', so together they must account for every row.
# tsukuba_all_irl is used rather than tsukuba_irl because the latter also
# requires compID == 0 and therefore leaves out real-life cars with a
# positive compID.
assert tsukuba_ac.shape[0] + tsukuba_all_irl.shape[0] == tsukuba.shape[0], (
    "Assetto Corsa and real-life subsets do not add up to the full dataset"
)

In [None]:
# One stacked panel per variable.
f, axes = plt.subplots(5, 1, figsize=(35, 25))

# (frame, column, colour, title, extra histplot keyword arguments), one entry
# per panel from top to bottom. Lap times are drawn from tsukuba_timed so
# that the 0.0 placeholder of untimed cars is not plotted.
hist_specs = [
    (tsukuba, 'WHP', "purple", "Distribution of Wheel Horsepower", {}),
    (tsukuba, 'WTQ', "grey", "Distribution of Torque (NM)", {}),
    (tsukuba, 'Weight', "black", "Distribution of Weight (kg)", {}),
    (tsukuba, 'P/W (hp/t)', "red",
     "Distribution of Power to Weight Ratios (hp/t)", {'binwidth': 25}),
    (tsukuba_timed, 'Lap Time I', "red", "Distribution of Lap Times (S)", {}),
]

for ax, (frame, column, colour, title, extra) in zip(axes, hist_specs):
    # Histogram with a KDE overlay; the return value of .set() is not kept,
    # so no throwaway names are bound in the notebook namespace.
    sns.histplot(
        data=frame,
        x=column,
        ax=ax,
        color=colour,
        kde=True,
        **extra,
    ).set(title=title)

In [None]:
# Columns compared against each other in the pair plot.
cols = [
    'Year',
    'MSRP (2022 USD)',
    'Lap Time I',
    'WHP',
    'WTQ',
    'Weight',
    'P/W (hp/t)',
]

# Restrict the timed cars to those columns; reused by the regression cell.
subsetdf = tsukuba_timed.loc[:, cols]
sns.pairplot(data=subsetdf)

In [None]:
# Per-brand mean and median over the timed cars. numeric_only=True keeps the
# aggregation to numeric columns; without it, the string columns of the frame
# make mean()/median() raise TypeError on pandas >= 2.0.
brand_groups = tsukuba_timed.groupby('Brand')
tsukuba_brand_mu = brand_groups.mean(numeric_only=True)
tsukuba_brand_med = brand_groups.median(numeric_only=True)
display(tsukuba_brand_mu.head())
display(tsukuba_brand_med.head())

In [None]:
def _zscore(series):
    """Subtract the mean of `series` and divide by its standard deviation."""
    return (series - series.mean()) / series.std()


# Add standardised copies of power-to-weight and first lap time under short
# names that can be used directly in the model formula.
subsetdf = subsetdf.assign(
    pwr=_zscore(subsetdf['P/W (hp/t)']),
    lt=_zscore(subsetdf['Lap Time I']),
)

pwr, lt = subsetdf['pwr'], subsetdf['lt']

# Ordinary least squares of standardised power-to-weight on standardised
# lap time, built from a patsy formula.
outcome, predictors = patsy.dmatrices('pwr ~ lt', subsetdf)
mod = sm.OLS(outcome, predictors)
res = mod.fit()
display(res.summary())

In [None]:
# Histogram of the Year column across all rows, using 50 bins.
tsukuba['Year'].plot.hist(bins=50)

## Baseline Modelling
### Linear Regression

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Select the specified columns for training and testing
# NOTE(review): 'RC I' is selected here (so a missing value in it removes the
# row in dropna below) but it is in neither feature list further down, so it
# presumably never reaches the model — confirm whether it should be a feature
# or be left out of this selection.
columns = ['Year', 'Brand', 'RC I', 'Country', 'MSRP (2022 USD)', 'WHP', 'WTQ', 'Weight', 'P/W (hp/t)', 'Layout', 'Trans', 'DR', 'Lap Time I']
target = 'Lap Time I'

# Missing lap times were filled with '0:00.000' when the data was loaded, so
# they read 0.0 rather than NaN and dropna() alone does not remove them.
# Requiring a positive lap time keeps untimed cars out of both the training
# target and the evaluation.
train_data = tsukuba_all_irl.loc[tsukuba_all_irl[target] > 0, columns].dropna()
test_data = tsukuba_test_set_irl.loc[tsukuba_test_set_irl[target] > 0, columns].dropna()

# Split the data into features and target
X_train = train_data.drop(target, axis=1)
y_train = train_data[target]
X_test = test_data.drop(target, axis=1)
y_test = test_data[target]

# Preprocessing steps for numerical and categorical features
numeric_features = ['Year', 'MSRP (2022 USD)', 'WHP', 'WTQ', 'Weight', 'P/W (hp/t)']
categorical_features = ['Brand', 'Country', 'Layout', 'Trans', 'DR']

numeric_transformer = StandardScaler()
# handle_unknown='ignore' lets prediction proceed when the test set contains
# a category that was not present during fitting.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the linear regression model
linear_regression_model_all = LinearRegression()

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', linear_regression_model_all)
])

# Train the linear regression model on all real-life cars.
# As the printed message states, this training frame includes the cars that
# are also in the test set, so the RMSE below is not an out-of-sample error.
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate RMSE for the predictions
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE for LReg model trained on all real life data including cars in the test set:", rmse)

In [None]:
# Number of missing values in each column of the real-life frame.
tsukuba_all_irl.isna().sum()