In [3]:
#simple example of a machine learning pipeline using the Naïve Bayes classification algorithm 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np
# Generate a dummy dataset from a seeded generator so that the data -- and
# therefore the reported accuracy -- is the same on every run. The split below
# was already seeded; the data itself was not.
rng = np.random.default_rng(42)
X = rng.random((100, 5))  # 100 samples, 5 features, uniform in [0, 1)
y = rng.integers(0, 2, 100)  # Binary target variable (0 or 1)
# NOTE: the labels are drawn independently of the features, so there is nothing
# for the model to learn here; an accuracy close to chance is expected.

# Hold out 20% of the samples for testing (random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a Gaussian Naive Bayes model and train it on the training split
model = GaussianNB()
model.fit(X_train, y_train)

# Predict on the held-out split and report the fraction of correct labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.50


In [None]:
#K-Fold Cross Validation 
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import seaborn as sns
# Load the "titanic" dataset through seaborn
df = sns.load_dataset("titanic")

# Features: age and fare, with missing entries replaced by the column mean.
# Target: the survived column.
# NOTE(review): the fill means are computed on the whole dataset, i.e. before
# the folds are split, so each test fold contributes to its own imputation.
raw_features = df[['age', 'fare']]
X = pd.DataFrame(raw_features.fillna(raw_features.mean()))  # DataFrame, so .iloc works
y = pd.Series(df['survived'])  # Series, so .iloc works

# 5 shuffled folds with a fixed seed, one model instance refit on every fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression()

# One accuracy value per fold
accuracy_scores = []
for train_index, test_index in kf.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Fit on the training rows, then score the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_scores.append(acc)

# Mean accuracy across the folds
print("K-Fold CV Average Accuracy:", np.mean(accuracy_scores))

K-Fold CV Average Accuracy: 0.6498336576486097


In [5]:
from sklearn.model_selection import LeaveOneOut
# Leave-one-out splitter.
# This cell reuses X, y, model, accuracy_score and np from the K-Fold cell
# above, so that cell has to be run first.
loo = LeaveOneOut()

# One accuracy value per split
loo_scores = []
for train_index, test_index in loo.split(X):
    # Rows used for fitting vs. the rows held out in this split
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit the shared model, then score the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    loo_scores.append(accuracy_score(y_test, y_pred))

# Mean accuracy across all splits
print("LOOCV Average Accuracy:", np.mean(loo_scores))

LOOCV Average Accuracy: 0.6565656565656566


In [None]:
# Supervised Learning (Logistic Regression)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Toy feature matrix: one row per person, columns are [age, income]
X = [
    [25, 50000],
    [30, 60000],
    [22, 45000],
    [35, 85000],
    [40, 100000],
    [28, 58000],
]

# One label per row above: 0 = No, 1 = Yes
y = [0, 1, 0, 1, 1, 1]

# Hold out test_size=0.2 of the rows; random_state makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the classifier and fit it on the training rows
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classify a single unseen person (same [age, income] layout as X)
new_person = [[27, 55000]]
prediction = model.predict(new_person)
if prediction[0] == 1:
    answer = "Yes"
else:
    answer = "No"
print("Will this person buy the product?", answer)


Accuracy: 1.0
Will this person buy the product? Yes


In [21]:
# 4. Classification
# Support Vector Machine

from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load iris; features are used as-is
iris = datasets.load_iris()
X = iris.data
# Binary label: 1 where the original target is 0, otherwise 0
y = (iris.target == 0).astype(int)

# Hold out 30% of the samples for testing (random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit an RBF-kernel support vector classifier on the training split
svm = SVC(kernel='rbf', C=1, gamma='scale')
svm.fit(X_train, y_train)

# Predict the held-out split and report the fraction of correct labels
y_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 1.0


In [None]:
# IMPLEMENT LINEAR REGRESSION

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error ,r2_score

# Load iris and pick two of its measurement columns
iris = datasets.load_iris()
measurements = iris.data
X = measurements[:, 2].reshape(-1, 1)  # petal length, reshaped to one feature column
y = measurements[:, 3]  # petal width is the regression target

# Hold out 20% of the samples for testing (random_state fixes the shuffle)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show what is about to be passed to the model: shapes plus the first 5 rows
print("=== Data Sent to Linear Regression Model ===")
print("x_train shape:", x_train.shape)
print("x_train sample:\n", x_train[:5])
print("y_train shape:", y_train.shape)
print("y_train sample:", y_train[:5])
print("============================================")

# Fit ordinary linear regression; the value returned by fit() is kept as ModelLR
LR = LinearRegression()
ModelLR = LR.fit(x_train, y_train)

# Predict the held-out samples and print the raw predictions
PredictionLR = ModelLR.predict(x_test)
print("Predictions:", PredictionLR)

# Evaluation metrics on the held-out samples
mse = mean_squared_error(y_test, PredictionLR)
r2 = r2_score(y_test, PredictionLR)

print("Mean Squared Error:", mse)
print("R² Score (Accuracy-like):", r2)

# Reading the R² score:
#   1.0 -> perfect prediction
#   0.0 -> no better than always predicting the mean
#   < 0 -> worse than always predicting the mean

=== Data Sent to Linear Regression Model ===
x_train shape: (120, 1)
x_train sample:
 [[1. ]
 [1.5]
 [4.4]
 [1.6]
 [1.3]]
y_train shape: (120,)
y_train sample: [0.2 0.4 1.4 0.2 0.2]
Predictions: [1.58555194 0.34583706 2.49467619 1.50290428 1.62687577 0.2631894
 1.13098982 1.75084726 1.50290428 1.2549613  1.75084726 0.22186557
 0.18054174 0.2631894  0.2631894  1.58555194 2.04011406 1.2549613
 1.50290428 1.9574664  0.30451323 1.6681996  0.30451323 1.9574664
 2.28805704 1.79217109 2.04011406 2.08143789 0.22186557 0.30451323]
Mean Squared Error: 0.045604284097661846
R² Score (Accuracy-like): 0.9282562958836972
