# Training the first model (Part 2)

<img src="images/pen2.jpg" width="300" height="300">

In [1]:
# Import packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import pandas as pd
import numpy as np
import sqlite3
import os
import joblib

## Define the first model

In [2]:
# Configuration of the first model. Every value a reader might tune lives here
# and is read by the cells below.
model_version = 1
model_id = 100 + int(model_version)  # model ids are the version offset by 100
method = 'LogisticRegression'
data_size = 165
random_state = 42
max_iter = 30   # kept low on purpose to lower the model's performance
scaled = False  # no scaling on purpose: lowers performance and sets this model apart from the others

# data_size was chosen (subjectively) to be roughly half of the dataset

### Add the new statuses to the STATUS table

In [3]:
# Connect to the database and register the two statuses used by this model:
# one for its training rows and one for its test rows.
# The connection is intentionally left open: the next cell reads from it and
# closes it.
# NOTE(review): the test status uses id `model_version + 1`, which is the same id
# a notebook with `model_version + 1` would use for its train status - confirm
# the id scheme before adding further models.
conn = sqlite3.connect('db_penguins.db')
status_rows = [
    (model_version, f"M{model_version}_train"),
    (model_version + 1, f"M{model_version}_test"),
]
try:
    # Bound parameters instead of f-string SQL: values are never parsed as SQL.
    conn.executemany(
        "INSERT INTO STATUS (status_id, status_type) VALUES (?, ?)",
        status_rows,
    )
    conn.commit()
except sqlite3.Error:
    # Closing without commit discards a partially applied insert, so a failed
    # run does not leave a half-written transaction behind.
    conn.close()
    raise

### Query a random sample of data from the database

In [4]:
# Draw a random sample of `data_size` rows from the PENGUINS table.
# SQLite's RANDOM() cannot be seeded, so the rows are read in a fixed order and
# sampled in pandas with `random_state`; the sample (and every result derived
# from it) is then the same on each Restart & Run All.
try:
    penguins_all = pd.read_sql_query("SELECT * FROM PENGUINS ORDER BY animal_id", conn)
finally:
    # Close the database connection even if the query fails
    conn.close()

# Like LIMIT, never ask for more rows than the table holds.
sample_size = min(data_size, len(penguins_all))
df = (
    penguins_all
    .sample(n=sample_size, random_state=random_state)
    .reset_index(drop=True)  # fresh 0..n-1 index, as a direct query would give
)

In [5]:
# Encode the 'species' labels to integers.
# The encoded labels go into their own Series instead of overwriting
# df['species'], so re-running this cell always fits the encoder on the original
# labels and `label_encoder.classes_` keeps the species names.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df['species']),
    index=df.index,
    name='species',
)

# Selecting features; 'animal_id' is not a feature, it is carried along only to
# track which datapoints end up in which split (it is dropped before training).
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'animal_id']]

# Split the data into training and test sets (70% / 30%), seeded from the
# parameter cell rather than a second hard-coded 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)

# After splitting, record the 'animal_id' of every row in each split
train_ids = X_train['animal_id'].tolist()
test_ids = X_test['animal_id'].tolist()

In [6]:
# Serialise both id lists as comma-separated strings so they can be stored
# in the fv_train / fv_test columns of db table MODEL
fv_train = ','.join(str(animal_id) for animal_id in train_ids)
fv_test = ','.join(str(animal_id) for animal_id in test_ids)

In [7]:
# Tag every sampled penguin in the PENGUINS table with the status of the split
# it landed in: `model_version` for training rows, `model_version + 1` for test rows.
update_status = "UPDATE PENGUINS SET status_id = ? WHERE animal_id = ?"
train_status_id = model_version
test_status_id = model_version + 1

conn = sqlite3.connect('db_penguins.db')
try:
    # One batched, fully parameterized statement per split.
    conn.executemany(update_status, [(train_status_id, aid) for aid in train_ids])
    conn.executemany(update_status, [(test_status_id, aid) for aid in test_ids])
    # Commit only after both splits were written; on an error nothing is
    # committed, so the table is never left half-updated.
    conn.commit()
finally:
    # Always release the connection, even if an update failed
    conn.close()

In [8]:
# Scaling is controlled by the `scaled` flag from the parameter cell
# (False for this first model). The same flag is written to the MODEL table,
# so driving the preprocessing from it keeps the stored metadata truthful.

# Remove 'animal_id' from X_train and X_test: it only tracks datapoints.
# errors='ignore' makes the cell safe to re-run after the column is gone.
X_train = X_train.drop(columns=['animal_id'], errors='ignore')
X_test = X_test.drop(columns=['animal_id'], errors='ignore')

# Feature Scaling
scaler = StandardScaler()
if scaled:
    # Fit on the training split only, then apply the same transform to the test split
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
else:
    # Unscaled features are passed through under the same names so the
    # training cell does not need to know whether scaling happened
    X_train_scaled = X_train
    X_test_scaled = X_test

In [9]:
# Fit the logistic-regression classifier on the training split
model = LogisticRegression(max_iter=max_iter, random_state=random_state).fit(X_train_scaled, y_train)

# Score the held-out test split
y_pred = model.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the metrics
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.72
Confusion Matrix:
[[18  0  3]
 [ 7  4  0]
 [ 4  0 14]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Save model to local directory (./models relative to the notebook)
model_directory = "/models"
model_filename = f"model_v{model_version}.joblib"
model_path = f".{model_directory}/{model_filename}"

# joblib.dump does not create missing folders; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

['./models/model_v1.joblib']

In [11]:
# Write model data to sql: one row in the MODEL table describing this model,
# including the ids of its training (fv_train) and test (fv_test) datapoints.
sql_insert_model = """
INSERT INTO MODEL (
    model_id,
    model_version,
    model_method,
    model_size,
    model_scaled,
    model_accuracy,
    fv_train,
    fv_test
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
"""
# `scaled` is stored as 0/1
model_row = (model_id, model_version, method, data_size, int(scaled), accuracy, fv_train, fv_test)

conn = sqlite3.connect('db_penguins.db')
try:
    cursor = conn.cursor()
    cursor.execute(sql_insert_model, model_row)
    conn.commit()
finally:
    # Release the connection even if the insert fails
    conn.close()

