In [48]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from pathlib import Path
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_curve, roc_auc_score,log_loss, classification_report
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [2]:
np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.precision', 4)

In [8]:
# Try to download the file from a public URL.
# If this is not the correct URL, please provide the correct one or upload the file.
!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv -O /content/loan_toy_simple.csv

--2025-09-03 09:15:27--  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘/content/loan_toy_simple.csv’


2025-09-03 09:15:27 (6.65 MB/s) - ‘/content/loan_toy_simple.csv’ saved [60302/60302]



In [15]:
from pathlib import Path
ng = np.random.default_rng(7)
dataset = pd.read_csv('/content/loan_toy_simple.csv')
dataset.head()

Unnamed: 0,salary_lakhs,loan_lakhs,approve
0,12.0015,1.2712,1
1,16.3554,12.9408,1
2,14.411,16.0674,1
3,5.6033,10.7471,0
4,6.8027,14.7911,0


In [24]:
feature_columns = ['salary_lakhs', 'loan_lakhs']
x = dataset[feature_columns]
y = dataset['approve']
x, y = shuffle(x, y, random_state=42)

In [28]:
feature_columns = ['salary_lakhs', 'loan_lakhs']
features = dataset[feature_columns]
labels = dataset['approve']


In [29]:
features, labels = shuffle(features, labels, random_state=42)
train_features,test_features,train_labels,test_labels=train_test_split(features,labels,test_size=0.2,random_state=42)

In [30]:
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [42]:
log_reg=LogisticRegression(max_iter=1000,random_state=42)

log_reg_pipeline = Pipeline([("feature_scaling", StandardScaler()), ("linear_regression", LogisticRegression())])
log_reg_cv_results = cross_validate(log_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv, scoring="neg_mean_absolute_error")
log_reg_errors= pd.Series(-log_reg_cv_results['test_score'], name="lin_reg_errors")

log_reg.fit(train_features, train_labels)
train_accuracy = accuracy_score(train_labels, log_reg.predict(train_features))
test_accuracy = accuracy_score(test_labels, log_reg.predict(test_features))

labels_pred=log_reg.predict(test_features)

find_accu= accuracy_score(test_labels,labels_pred)
find_prec= precision_score(test_labels,labels_pred)
find_recall= recall_score(test_labels,labels_pred)
find_f1= f1_score(test_labels,labels_pred)
print("accuracy",find_accu)
print("precision",find_prec)
print("recall",find_recall)
print("f1",find_f1)
print("")

print("trained accuracy",train_accuracy)
print("test accuracy",test_accuracy)
print("")
if train_accuracy>test_accuracy+0.05:
  print("Overfitting")
elif train_accuracy<0.7 and test_accuracy<0.7:
  print("underfitting")
else:
  print("model is good")

accuracy 1.0
precision 1.0
recall 1.0
f1 1.0

trained accuracy 0.9479166666666666
test accuracy 1.0

model is good


In [50]:
dataset = load_iris()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.Series(dataset.target, name='target')
target_names = dataset.target_names
feature_names = dataset.feature_names

In [61]:
y_df=y.to_frame(name='target')
y_df['target_0']=(y_df['target']==0).astype(int)
y_df['target_1']=(y_df['target']==0).astype(int)

shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

features,labels=load_iris(as_frame=True,return_X_y=True)
train_features,test_features,train_labels,test_labels=train_test_split(features,labels,test_size=0.2,random_state=42)

log_reg=LogisticRegression(max_iter=1000,random_state=42)

log_reg_pipeline = Pipeline([("feature_scaling", StandardScaler()), ("linear_regression", LogisticRegression())])
log_reg_cv_results = cross_validate(log_reg_pipeline, train_features, train_labels, cv=shuffle_split_cv, scoring="neg_mean_absolute_error")
log_reg_errors= pd.Series(-log_reg_cv_results['test_score'], name="lin_reg_errors")

log_reg.fit(train_features, train_labels)
train_accuracy = accuracy_score(train_labels, log_reg.predict(train_features))
test_accuracy = accuracy_score(test_labels, log_reg.predict(test_features))

labels_pred=log_reg.predict(test_features)

find_accu= accuracy_score(test_labels,labels_pred)
find_prec= precision_score(test_labels,labels_pred, average='weighted')
find_recall= recall_score(test_labels,labels_pred, average='weighted')
find_f1= f1_score(test_labels,labels_pred, average='weighted')
cm= confusion_matrix(test_labels,labels_pred)
print("accuracy",find_accu)
print("precision",find_prec)
print("recall",find_recall)
print("f1",find_f1)
print("confusion_matrix",cm)
print(classification_report(test_labels,labels_pred, target_names=target_names))
print("")

print("trained accuracy",train_accuracy)
print("test accuracy",test_accuracy)
print("")
if train_accuracy>test_accuracy+0.05:
  print("Overfitting")
elif train_accuracy<0.7 and test_accuracy<0.7:
  print("underfitting")
else:
  print("model is good")


accuracy 1.0
precision 1.0
recall 1.0
f1 1.0
confusion_matrix [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


trained accuracy 0.975
test accuracy 1.0

model is good
