# Setup

In [None]:
# Setup

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, roc_curve, auc
from sklearn.datasets import fetch_california_housing, load_iris, load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load the California Housing dataset: feature columns plus the target,
# stored alongside them as 'MedHouseVal'.
california = fetch_california_housing()
df_california = pd.DataFrame(california.data, columns=california.feature_names)
df_california['MedHouseVal'] = california.target

# Load the Iris dataset; 'species' holds the class codes from iris.target.
iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['species'] = iris.target

# Load the Titanic dataset (read over the network from a public CSV).
titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df_titanic = pd.read_csv(titanic_url)

# Display the first few rows of each dataset.
# display() renders every frame as its own table; ending the cell with a bare
# tuple of frames would echo a single plain-text tuple repr instead.
display(df_california.head(), df_iris.head(), df_titanic.head())


# Advanced Pandas Operations

### Grouping and Aggregating

In [None]:
# Grouping and Aggregating (Instruction)
# Group data by 'AveRooms' and calculate the mean of 'MedHouseVal' and 'AveOccup'.
# 'AveRooms' is a continuous average, so grouping on its raw values produces
# close to one group per row. Rounding to whole rooms first lets each group
# pool many rows, which is what makes the per-group means informative.
rooms_rounded = df_california['AveRooms'].round().rename('AveRooms')
grouped = df_california.groupby(rooms_rounded).agg({'MedHouseVal': 'mean', 'AveOccup': 'mean'})
print("Grouped and Aggregated Data:\n", grouped)

# Grouping and Aggregating (Participant - on your own)
# Group data by 'HouseAge' and calculate the mean of 'MedHouseVal' and 'AveRooms'
# Instructions: Group the dataset by 'HouseAge' and calculate the mean of 'MedHouseVal' and 'AveRooms'.
# Use the .groupby() method and .agg() function similar to the previous example.
# 'HouseAge' is used as-is as the grouping key here (no rounding step).
grouped_house_age = df_california.groupby('HouseAge').agg({'MedHouseVal': 'mean', 'AveRooms': 'mean'})
print("Grouped by HouseAge and Aggregated Data:\n", grouped_house_age)


### Merging DataFrames

In [None]:
# Merging DataFrames (Instruction)
# Create a hypothetical additional_info_df keyed by a whole number of rooms
additional_info_df = pd.DataFrame({
    'AveRooms': [2, 4, 6, 8],
    'additional_info': ['info1', 'info2', 'info3', 'info4']
})
# 'AveRooms' in the housing data is a continuous average, so an exact-equality
# join against the integer keys above would match almost no rows. Join on a
# rounded room count instead; the original 'AveRooms' column is left untouched
# and df_california itself is not modified (assign returns a new frame).
rooms_key = 'AveRoomsRounded'
housing_with_key = df_california.assign(
    **{rooms_key: df_california['AveRooms'].round().astype(int)}
)
info_with_key = additional_info_df.rename(columns={'AveRooms': rooms_key})
merged_df = pd.merge(housing_with_key, info_with_key, on=rooms_key, how='left')
print("Merged DataFrame:\n", merged_df.head())

# Merging DataFrames (Participant - on your own)
# Create another hypothetical additional_info_df
additional_info_df_participant = pd.DataFrame({
    'HouseAge': [10, 20, 30, 40],
    'additional_info': ['infoA', 'infoB', 'infoC', 'infoD']
})
# how='left' keeps every housing row; rows whose 'HouseAge' is not one of the
# four keys above get NaN in 'additional_info'.
merged_df_participant = pd.merge(df_california, additional_info_df_participant, on='HouseAge', how='left')
print("Merged DataFrame (Participant):\n", merged_df_participant.head())


### Pivot Tables

In [None]:
# Pivoting directly on the continuous 'AveRooms' values would create one
# column (or row) per distinct float and an almost entirely NaN table.
# Work from a copy whose 'AveRooms' is rounded to whole rooms; df_california
# itself is not modified (assign returns a new frame).
housing_rounded_rooms = df_california.assign(AveRooms=df_california['AveRooms'].round())

# Using Pivot Tables (Instruction)
# Create a pivot table with 'HouseAge' as index, 'AveRooms' as columns, and mean of 'MedHouseVal' as values
pivot_table = pd.pivot_table(housing_rounded_rooms, values='MedHouseVal', index='HouseAge', columns='AveRooms', aggfunc='mean')
print("Pivot Table:\n", pivot_table)

# Using Pivot Tables (Participant - on your own)
# Create a pivot table with 'AveRooms' as index, 'HouseAge' as columns, and mean of 'MedHouseVal' as values
pivot_table_participant = pd.pivot_table(housing_rounded_rooms, values='MedHouseVal', index='AveRooms', columns='HouseAge', aggfunc='mean')
print("Pivot Table (Participant):\n", pivot_table_participant)


# Introduction to Machine Learning

### Loading and inspecting the data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
import matplotlib.pyplot as plt

# Fetch California Housing and build the feature frame / target series
california = fetch_california_housing()
X, y = (
    pd.DataFrame(california.data, columns=california.feature_names),
    pd.Series(california.target, name='MedHouseVal'),
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# visualize missingness
# Plots missingno's matrix view of the training features so that any gaps in a
# column are visible at a glance.

# NOTE(review): missingno is a third-party package imported only in this cell —
# confirm it is installed in the environment before running.
import missingno as msno
import matplotlib.pyplot as plt
# IPython magics: use the inline matplotlib backend and request PNG figures from it.
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# X_train is the training split produced by train_test_split in the previous cell.
msno.matrix(X_train)
plt.show()

In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Actual-vs-predicted scatter; a perfect model would put every point on the diagonal
plt.scatter(y_test, y_pred)
for apply_text, text in (
    (plt.xlabel, 'Actual Values'),
    (plt.ylabel, 'Predicted Values'),
    (plt.title, 'Actual vs Predicted Values'),
):
    apply_text(text)
plt.show()

In [None]:
# Linear Regression (Participant - on your own)
# Instructions: Train a linear regression model using a subset of features such as 'MedInc' and 'HouseAge'.
participant_features = ['MedInc', 'HouseAge']
X_participant, y_participant = X[participant_features], y

# Same 80/20 split and seed as the full-feature model
(
    X_train_participant,
    X_test_participant,
    y_train_participant,
    y_test_participant,
) = train_test_split(X_participant, y_participant, test_size=0.2, random_state=42)

# Fit on the reduced feature set, then predict and score on the held-out rows
model_participant = LinearRegression().fit(X_train_participant, y_train_participant)
y_pred_participant = model_participant.predict(X_test_participant)
mse_participant, r2_participant = (
    mean_squared_error(y_test_participant, y_pred_participant),
    r2_score(y_test_participant, y_pred_participant),
)
print(f'Mean Squared Error (Participant): {mse_participant}')
print(f'R^2 Score (Participant): {r2_participant}')

### PCA

In [None]:
# Import the breast_cancer dataset
from sklearn.datasets import load_breast_cancer
data=load_breast_cancer()

# List the fields available on the loaded dataset object.
# Printed explicitly: a bare `data.keys()` in the middle of a cell is evaluated
# and discarded, because only a cell's last expression is echoed.
print(data.keys())

# Check the output classes
print(data['target_names'])

# Check the input attributes
print(data['feature_names'])

In [None]:
# Wrap the raw feature matrix in a DataFrame with named columns
df1 = pd.DataFrame(data['data'], columns=data['feature_names'])

# Standardise the features before PCA.
# fit() returns the scaler itself, so fitting and transforming can be chained.
scaling = StandardScaler()
Scaled_data = scaling.fit(df1).transform(df1)

# Keep the first three principal components
principal = PCA(n_components=3)
x = principal.fit(Scaled_data).transform(Scaled_data)

# Shape of the projected data: (rows, retained components)
print(x.shape)

In [None]:
# Inspect the fitted PCA component vectors (the loadings of each retained
# component on the scaled input features).
principal.components_

In [None]:
# First two principal components, coloured by the dataset's target class
fig, ax = plt.subplots(figsize=(10, 10))
first_pc, second_pc = x[:, 0], x[:, 1]
ax.scatter(first_pc, second_pc, c=data['target'], cmap='plasma')
ax.set_xlabel('pc1')
ax.set_ylabel('pc2')

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Square canvas holding a single 3-D axes
fig = plt.figure(figsize=(10,10))
axis = fig.add_subplot(111, projection='3d')

# Columns 0, 1 and 2 of x are pc1, pc2 and pc3 respectively
pc1, pc2, pc3 = x[:,0], x[:,1], x[:,2]
axis.scatter(pc1, pc2, pc3, c=data['target'], cmap='plasma')

# Label the three axes with a shared font size
label_size = 10
axis.set_xlabel("PC1", fontsize=label_size)
axis.set_ylabel("PC2", fontsize=label_size)
axis.set_zlabel("PC3", fontsize=label_size)

In [None]:
# check how much variance is explained by each principal component
# (one ratio per component retained by the PCA fitted above in this notebook)
print(principal.explained_variance_ratio_)

### Clustering (K-means)

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d  # noqa: F401
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans

In [None]:
# The iris data records several measurements for flowers from n species.
# Question: without looking up how many species there are, can a clustering
# algorithm separate the flowers into species on its own?

iris = datasets.load_iris()
X, y = iris.data, iris.target

# K-means is one unsupervised way to attempt this
n = 5 # the number of clusters we guess
kmeans = KMeans(n_clusters=n, random_state=42).fit(X) # build and fit in one step

# cluster assignment for every sample
labels = kmeans.labels_

# 3-D scatter of three of the four measurements, coloured by cluster
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection="3d", elev=48, azim=134)
petal_width, sepal_length, petal_length = X[:, 3], X[:, 0], X[:, 2]
ax.scatter(petal_width, sepal_length, petal_length, c=labels.astype(float), edgecolor="k")

# hide the tick labels on all three axes
for tick_axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    tick_axis.set_ticklabels([])

ax.set_xlabel("Petal width")
ax.set_ylabel("Sepal length")
ax.set_zlabel("Petal length")

In [None]:
# Choosing the number of clusters in a more principled way:
#
# "Inertia" is the sum of squared distances from each sample to its nearest
# cluster centre. We want low inertia AND a small K, but the two pull against
# each other: inertia keeps falling as K grows. The Elbow method picks the K
# where the drop in inertia starts to level off. K=3 is the “elbow” of this graph.

# On your own: finish the for loop below to plot inertia over n_clusters,
# and determine the optimal number of clusters

cluster_counts = range(1, 11)
inertia = []
for n in cluster_counts:
    # fit() returns the estimator, so build and fit in one expression
    kmeans = KMeans(n_clusters=n, random_state=42).fit(X)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_counts, inertia)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

In [None]:
# Compare k-means output to ground truth


def _finish_iris_axes(ax, title):
    """Blank the tick labels, then set the shared axis names and the panel title."""
    for tick_axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        tick_axis.set_ticklabels([])
    ax.set_xlabel("Petal width")
    ax.set_ylabel("Sepal length")
    ax.set_zlabel("Petal length")
    ax.set_title(title)


# Seed NumPy's global RNG; the estimators below are created without random_state
np.random.seed(5)

iris = datasets.load_iris()
X, y = iris.data, iris.target

estimators = [
    ("k_means_iris_8", KMeans(n_clusters=8)),
    ("k_means_iris_3", KMeans(n_clusters=3)),
    ("k_means_iris_bad_init", KMeans(n_clusters=3, n_init=1, init="random")),
]
titles = ["8 clusters", "3 clusters", "3 clusters, bad initialization"]

fig = plt.figure(figsize=(10, 8))

# One panel per estimator, fitted in list order, occupying grid slots 1..3
for slot, ((name, est), title) in enumerate(zip(estimators, titles), start=1):
    ax = fig.add_subplot(2, 2, slot, projection="3d", elev=48, azim=134)
    est.fit(X)
    labels = est.labels_
    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float), edgecolor="k")
    _finish_iris_axes(ax, title)

# Fourth panel: the true species labels
ax = fig.add_subplot(2, 2, 4, projection="3d", elev=48, azim=134)

species = {0: "Setosa", 1: "Versicolour", 2: "Virginica"}
for label, name in species.items():
    members = y == label
    # place the species name at the class centroid, lifted by 2 on the z axis
    ax.text3D(
        X[members, 3].mean(),
        X[members, 0].mean(),
        X[members, 2].mean() + 2,
        name,
        horizontalalignment="center",
        bbox=dict(alpha=0.2, edgecolor="w", facecolor="w"),
    )

ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor="k")
_finish_iris_axes(ax, "Ground Truth")

plt.subplots_adjust(wspace=0.25, hspace=0.25)
plt.show()

# Deeper Dive into ML

### Logistic Regression with Titanic Dataset

In [None]:
# Preview the Titanic data loaded in the Setup cell.
# The name `titanic` is only assigned in a later cell, so referencing it here
# raises NameError on a fresh kernel (Restart & Run All).
df_titanic.head()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Function to plot confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, divide every row by its total so cells show per-true-class
        fractions. Rows that sum to zero are shown as 0 rather than NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cell shading.

    Returns
    -------
    None
        The matrix is printed and drawn; the caller decides when to show it.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; other cells keep the
        # 0.0 from `out`, avoiding NaN and a divide-by-zero warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # 'd' is only valid for integer cells; any float matrix (normalised or
    # passed in as floats) is shown with two decimals instead of raising.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels just added.
    plt.tight_layout()


In [None]:

# Load seaborn's Titanic data and drop rows missing any of the fields used below
titanic = sns.load_dataset('titanic').dropna(subset=['age', 'fare', 'embarked'])

# Build the design matrix (non-numeric columns one-hot encoded, first level dropped) and target
feature_columns = ['age', 'fare', 'sex', 'embarked', 'class', 'who', 'alone']
X = pd.get_dummies(titanic[feature_columns], drop_first=True)
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier (fit() returns the estimator)
logreg = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

# Hard predictions and positive-class probabilities for the held-out rows
y_pred = logreg.predict(X_test_scaled)
y_prob = logreg.predict_proba(X_test_scaled)[:, 1]

# Metrics: accuracy, confusion matrix, ROC curve and its area
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{cm}')

# ROC curve with the chance diagonal for reference
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix heat map on a fresh figure
plt.figure()
class_names = ['Not Survived', 'Survived']
plot_confusion_matrix(cm, classes=class_names,
                      title='Confusion Matrix - Initial Model')



In [None]:

# Improve the model: one engineered feature plus a hyperparameter search
# Interaction term: product of age and fare
X = X.assign(age_fare=X['age'] * X['fare'])

# Re-split with the extra column (same seed) and re-fit the existing scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5-fold grid search over regularisation strength and solver, scored by accuracy
param_grid = {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}
grid_search = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Take the best estimator found by the search and predict on the held-out rows
best_logreg = grid_search.best_estimator_
y_pred_best = best_logreg.predict(X_test_scaled)
y_prob_best = best_logreg.predict_proba(X_test_scaled)[:, 1]

# Same metrics as for the initial model
accuracy_best = accuracy_score(y_test, y_pred_best)
cm_best = confusion_matrix(y_test, y_pred_best)
fpr_best, tpr_best, _ = roc_curve(y_test, y_prob_best)
roc_auc_best = auc(fpr_best, tpr_best)

print(f'Improved Accuracy: {accuracy_best}')
print(f'Improved Confusion Matrix:\n{cm_best}')

# ROC curve for the tuned model, with the chance diagonal for reference
plt.figure()
plt.plot(fpr_best, tpr_best, color='darkorange', lw=2, label=f'Improved ROC curve (area = {roc_auc_best:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (Improved)')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix heat map for the tuned model
plt.figure()
class_names = ['Not Survived', 'Survived']
plot_confusion_matrix(cm_best, classes=class_names,
                      title='Confusion Matrix - Improved Model')
plt.show()