In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
from PIL import Image
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression

# Run the IPython `%matplotlib inline` line magic (figures render inside the notebook output).
get_ipython().run_line_magic('matplotlib', 'inline')


In [None]:
# Load data
# `df` is the frame every later cell works on; `df01` is read from a second,
# Excel-format file.
df = pd.read_csv(filepath_or_buffer="measurements.csv")
df01 = pd.read_excel(io="measurements2.xlsx")

In [None]:
# Exploratory data analysis / Cleaning data
# Print the dtype / non-null overview, then keep the number of non-missing
# entries per column for later inspection.
df.info()
column_value_counts = df.notna().sum()

In [None]:
# Convert the measurement columns to numeric values.
#
# pd.to_numeric cannot parse a decimal comma (e.g. "21,5"); combined with
# errors='coerce' such entries would silently turn into NaN and later be
# overwritten by the column mean. The separator is therefore normalised to '.'
# first, so only genuinely unparseable entries are coerced to NaN.
# NOTE(review): assumes ',' only ever appears as a decimal separator in these
# columns (never as a thousands separator) — confirm against the raw file.
#
# No `downcast` is requested: it would make the resulting dtype depend on the
# data (int8/int16/... when every value happens to be integral).
columns_to_convert = ["distance", "consume", "temp_inside"]
for column in columns_to_convert:
    # astype(str) keeps the cell re-runnable once the column is already numeric.
    normalized_text = df[column].astype(str).str.replace(",", ".", regex=False)
    df[column] = pd.to_numeric(normalized_text, errors='coerce')



In [None]:
# Remove the columns that none of the models below use.
# errors="ignore" keeps this cell re-runnable after the columns are already gone.
columns_to_delete = ["specials", "refill liters", "refill gas"]
df = df.drop(columns=columns_to_delete, errors="ignore")

# Fill missing values in the numeric columns with that column's mean.
# numeric_only=True is required because df still contains the text column
# gas_type: since pandas 2.0, DataFrame.mean() raises TypeError on such frames
# instead of silently skipping the non-numeric columns.
column_means = df.mean(numeric_only=True)
# Reassign instead of inplace=True so the cell produces its result explicitly.
df = df.fillna(column_means)



In [None]:
# Data analysis and modeling preparation
# Partition the cleaned frame by dtype: numeric columns on one side,
# object-dtype columns on the other.
numerical_df = df.select_dtypes(include="number")
categorical_df = df.select_dtypes(include="object")



In [None]:
# One-hot encode the nominal column(s): every level of each listed column
# becomes its own indicator column named "<column>_<level>".
dummy_nominals = ["gas_type"]
categorical_df = pd.get_dummies(data=categorical_df, columns=dummy_nominals)



In [None]:
# Put the numeric columns and the encoded indicator columns side by side.
df_model = pd.concat(objs=[numerical_df, categorical_df], axis="columns")

# Predictors and target for the fuel-consumption regression.
# NOTE(review): if gas_type has exactly two levels, the two indicator columns
# are complementary (one is 1 minus the other) — consider dropping one.
X = df_model[[
    'distance', 'speed', 'temp_inside', 'temp_outside',
    'AC', 'rain', 'sun',
    'gas_type_E10', 'gas_type_SP98',
]]
y = df_model['consume']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [None]:
# Linear Regression Model
# Standardise the features: the scaler is fitted on the training split only and
# the same transform is then applied to the test split.
scaler = StandardScaler()
X_scaler_train = scaler.fit_transform(X_train)
X_scaler_test = scaler.transform(X_test)

model_lr = LinearRegression()
model_lr.fit(X_scaler_train, y_train)

# Predictions for the held-out rows.
y_predictive = model_lr.predict(X_scaler_test)

# Score the predictions on the test split and report them, so the cell shows
# its result the same way the classification cells print their accuracy.
mse = mean_squared_error(y_test, y_predictive)
r2 = r2_score(y_test, y_predictive)
print("Linear Regression MSE: {:.4f}".format(mse))
print("Linear Regression R^2: {:.4f}".format(r2))



In [None]:
# Coefficients Analysis
# Pair every fitted coefficient with its feature name. The names are read from
# X.columns — the frame the train/test split was made from — rather than from a
# second hard-coded list, so labels and coefficients cannot drift apart when
# the feature set changes.
coefficients = model_lr.coef_
coefficients_analysis = pd.DataFrame({
    "independent variables": list(X.columns),
    'coefficients': coefficients
})

# Largest (most positive) coefficient first.
df_orden_importances = coefficients_analysis.sort_values(by='coefficients', ascending=False)

# Horizontal bar chart: one bar per feature, bar length = coefficient value.
plt.figure(figsize=(10, 8))
sns.barplot(x='coefficients', y='independent variables', data=df_orden_importances)
plt.xlabel('coefficients')
plt.ylabel('Independent Variables')
plt.title('Importance of variables in the model')
plt.show()


In [None]:
# Logistic Regression Model
# Rebuild the modelling frame from the un-encoded object columns so that
# gas_type is available as a single label column.
categorical_df_lr = df.select_dtypes(include="object")
df_model_lr = pd.concat(objs=[numerical_df, categorical_df_lr], axis="columns")

# Predictors (now including consume) and the class label.
X_lr = df_model_lr[[
    'distance', 'speed', 'temp_inside', 'temp_outside',
    'AC', 'rain', 'sun', 'consume',
]]
y_lr = df_model_lr['gas_type']

# 70/30 split with a fixed seed.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_lr, y_lr, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then transform both splits with it.
# Note: this rebinds `scaler`, replacing the one fitted for the linear model.
scaler = StandardScaler().fit(X_train_lr)
X_scaler_train_lr = scaler.transform(X_train_lr)
X_scaler_test_lr = scaler.transform(X_test_lr)



In [None]:
# Unbalanced Logistic Regression Model
# Baseline classifier: trained on the scaled training split as-is, with no
# class rebalancing.
model_unbalanced = LogisticRegression().fit(X_scaler_train_lr, y_train_lr)

# Predict the held-out rows and report the share of correct labels.
y_predictive_lr = model_unbalanced.predict(X_scaler_test_lr)

accuracy = accuracy_score(y_true=y_test_lr, y_pred=y_predictive_lr)
print("Model Accuracy (Unbalanced): {:.2f}%".format(accuracy * 100))


In [None]:
# Balanced Logistic Regression Model using SMOTE
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched so the
# evaluation below uses the original rows.
smote = SMOTE(random_state=42)
X_smote_train, y_smote_train = smote.fit_resample(X_scaler_train_lr, y_train_lr)

# Same classifier as the baseline, trained on the resampled data.
model_balanced = LogisticRegression().fit(X_smote_train, y_smote_train)

y_predictive_using_smote = model_balanced.predict(X_scaler_test_lr)

# Note: this rebinds `accuracy`, replacing the baseline model's value.
accuracy = accuracy_score(y_true=y_test_lr, y_pred=y_predictive_using_smote)
print("Model Accuracy (Balanced with SMOTE): {:.2f}%".format(accuracy * 100))


In [None]:
# Classification Report
# Text summary of the SMOTE-trained model's predictions against the test labels.
report = classification_report(
    y_true=y_test_lr,
    y_pred=y_predictive_using_smote,
)
print(report)


In [None]:
# Confusion Matrix
# Count test predictions per (true label, predicted label) pair, with rows and
# columns ordered by the classes the balanced model was fitted on.
cm = confusion_matrix(
    y_true=y_test_lr,
    y_pred=y_predictive_using_smote,
    labels=model_balanced.classes_,
)

# Draw the matrix with the class names as tick labels.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model_balanced.classes_,
)
disp.plot()
plt.show()
