In [2]:
import pandas as pd

# Student performance CSV, read straight from its public URL
file_path = 'https://storage.googleapis.com/rg-ai-bootcamp/machine-learning/StudentsPerformance.csv'
data = pd.read_csv(file_path)

# Keep only the three exam-score columns; these are the inputs to the analysis below
score_columns = ['math score', 'reading score', 'writing score']
numerical_data = data[score_columns]
numerical_data

Unnamed: 0,math score,reading score,writing score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75
...,...,...,...
995,88,99,95
996,62,55,55
997,59,71,65
998,68,78,77


In [3]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Three side-by-side scatter panels, one per pair of exam scores.
# Each entry: (panel title, x column, x-axis label, y column, y-axis label)
score_pairs = [
    ("Reading vs Writing", 'reading score', "Reading Score", 'writing score', "Writing Score"),
    ("Math vs Reading", 'math score', "Math Score", 'reading score', "Reading Score"),
    ("Math vs Writing", 'math score', "Math Score", 'writing score', "Writing Score"),
]

fig = make_subplots(rows=1, cols=3, subplot_titles=tuple(pair[0] for pair in score_pairs))

for panel_col, (panel_title, x_column, x_label, y_column, y_label) in enumerate(score_pairs, start=1):
    # One marker trace per panel; the hover text repeats the axis labels
    fig.add_trace(
        go.Scatter(
            x=data[x_column],
            y=data[y_column],
            mode='markers',
            name=panel_title,
            hovertemplate=f'{x_label}: %{{x:.2f}}<br>{y_label}: %{{y:.2f}}<extra></extra>',
        ),
        row=1, col=panel_col,
    )
    fig.update_xaxes(title_text=x_label, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_label, row=1, col=panel_col)

# Overall figure size and title
fig.update_layout(height=500, width=1500, title_text="Students' Performance Comparisons")
fig.show()

In [4]:
# Pairwise sample covariances between the three exam scores
math_scores = data['math score']
reading_scores = data['reading score']
writing_scores = data['writing score']

cov_reading_writing = reading_scores.cov(writing_scores)
cov_math_writing = math_scores.cov(writing_scores)
cov_math_reading = math_scores.cov(reading_scores)

print("Covariance between Reading Score and Writing Score:", cov_reading_writing)
print("Covariance between Math Score and Writing Score:", cov_math_writing)
print("Covariance between Math Score and Reading Score:", cov_math_reading)

Covariance between Reading Score and Writing Score: 211.78666066066071
Covariance between Math Score and Writing Score: 184.93913313313314
Covariance between Math Score and Reading Score: 180.99895795795805


In [5]:
import plotly.express as px
from sklearn.decomposition import PCA

# Project the three score columns onto their first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(numerical_data)

# Wrap the projected coordinates in a DataFrame for plotting
component_names = ['Principal Component 1', 'Principal Component 2']
pca_df = pd.DataFrame(data=principal_components, columns=component_names)

# Carry the raw scores along so they can be shown in the hover tooltip
for hover_name, source_column in [
    ('Math Score', 'math score'),
    ('Reading Score', 'reading score'),
    ('Writing Score', 'writing score'),
]:
    pca_df[hover_name] = data[source_column]

# Scatter of PC1 against PC2
fig = px.scatter(
    pca_df,
    x=component_names[0],
    y=component_names[1],
    hover_data=['Math Score', 'Reading Score', 'Writing Score'],
    title='PCA of Student Performance',
    labels={'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'},
)

fig.show()

In [7]:
import plotly.express as px
import plotly.subplots as sp


# PCA scatter repeated three times, each panel coloured by one of the original scores.
# pca_df already carries the 'Math Score' / 'Reading Score' / 'Writing Score' columns
# from the cell that built it, so they are not re-assigned here.

def _score_colored_scatter(score_column):
    """Return a PC1-vs-PC2 scatter of ``pca_df`` coloured by ``score_column``.

    Parameters
    ----------
    score_column : str
        Name of the pca_df column used for the marker colour
        ('Math Score', 'Writing Score' or 'Reading Score').

    Returns
    -------
    plotly.graph_objects.Figure
        A single-trace Plotly Express figure.
    """
    return px.scatter(
        pca_df,
        x='Principal Component 1',
        y='Principal Component 2',
        hover_data=['Math Score', 'Reading Score', 'Writing Score'],
        title=f'PCA of Student Performance ({score_column})',
        labels={'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'},
        color=score_column,
        color_continuous_scale='rainbow',
    )


# Panel order (left to right): Math, Writing, Reading
panel_titles = ['Math Score', 'Writing Score', 'Reading Score']
fig1 = _score_colored_scatter('Math Score')
fig2 = _score_colored_scatter('Writing Score')
fig3 = _score_colored_scatter('Reading Score')

# subplot_titles centres each label over its own panel, replacing hand-placed annotations
fig = sp.make_subplots(
    rows=1,
    cols=3,
    shared_xaxes=False,
    shared_yaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=panel_titles,
)

# Copy the single trace out of each Plotly Express figure into its panel
for panel_col, panel_fig in enumerate((fig1, fig2, fig3), start=1):
    fig.add_trace(panel_fig['data'][0], row=1, col=panel_col)

# Without row/col these apply to every subplot; layout.xaxis / layout.yaxis alone
# would only label the first panel.
fig.update_xaxes(title_text='Principal Component 1 (PC1)')
fig.update_yaxes(title_text='Principal Component 2 (PC2)')

# The copied traces point at layout.coloraxis, which a fresh subplot figure does not
# configure, so the colour scale has to be set again here to keep 'rainbow'.
fig.update_layout(
    title='PCA of Student Performance',
    showlegend=False,
    coloraxis=dict(colorscale='rainbow', colorbar=dict(title='Score')),
)

# Show the horizontal subplot
fig.show()

In [8]:
from sklearn.cluster import KMeans
import plotly.express as px

# Cluster the students in PCA space (22 clusters, fixed seed for repeatable labels)
kmeans = KMeans(n_clusters=22, random_state=0)
cluster_labels = kmeans.fit_predict(principal_components)
pca_df['Cluster'] = cluster_labels

# Scatter of the two components, coloured by the assigned cluster label
cluster_plot_options = dict(
    x='Principal Component 1',
    y='Principal Component 2',
    color='Cluster',
    title='K-Means Clustering on PCA Components',
    labels={'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'},
    color_continuous_scale='rainbow',
    hover_data=['Math Score', 'Reading Score', 'Writing Score'],
)
fig4 = px.scatter(pca_df, **cluster_plot_options)

fig4.show()

In [9]:
import numpy as np

# Per-subject mean score; used later to centre a new student's scores
mean_math = np.mean(numerical_data['math score'])
mean_reading = np.mean(numerical_data['reading score'])
mean_writing = np.mean(numerical_data['writing score'])

for mean_label, mean_value in (
    ("Mean Math Score:", mean_math),
    ("Mean Reading Score:", mean_reading),
    ("Mean Writing Score:", mean_writing),
):
    print(mean_label, mean_value)

Mean Math Score: 66.089
Mean Reading Score: 69.169
Mean Writing Score: 68.054


In [10]:
# Scores of the hypothetical student to place in PCA space
math_score, reading_score, writing_score = 45, 80, 90

# Subtract each subject's mean. Note this only centres the scores (no division by
# the standard deviation), despite the "standardized" naming.
math_score_standardized = math_score - mean_math
reading_score_standardized = reading_score - mean_reading
writing_score_standardized = writing_score - mean_writing

for centered_label, centered_value in (
    ("Standardized Math Score:", math_score_standardized),
    ("Standardized Reading Score:", reading_score_standardized),
    ("Standardized Writing Score:", writing_score_standardized),
):
    print(centered_label, centered_value)

Standardized Math Score: -21.089
Standardized Reading Score: 10.831000000000003
Standardized Writing Score: 21.945999999999998


In [11]:
# Loadings of each original score column on the two fitted components
pca_weights = pca.components_

# Rows are components, columns are the original score columns
pca_weights_df = pd.DataFrame(
    pca_weights,
    index=['PC1', 'PC2'],
    columns=numerical_data.columns,
)

# Heading on one line, table underneath
print("PCA Weights for Each Column:", pca_weights_df, sep="\n")

PCA Weights for Each Column:
     math score  reading score  writing score
PC1    0.562649       0.573977       0.594959
PC2    0.825612      -0.353292      -0.439943


In [12]:
# Project the centred scores by hand: each component is the weighted sum of the
# three centred scores, using that component's row of weights.
pc1_weights = pca_weights_df.loc["PC1"]
pc1_calculation = (
    pc1_weights["math score"] * math_score_standardized
    + pc1_weights["reading score"] * reading_score_standardized
    + pc1_weights["writing score"] * writing_score_standardized
)
print("PC1 Calculation:", pc1_calculation)

pc2_weights = pca_weights_df.loc["PC2"]
pc2_calculation = (
    pc2_weights["math score"] * math_score_standardized
    + pc2_weights["reading score"] * reading_score_standardized
    + pc2_weights["writing score"] * writing_score_standardized
)
print("PC2 Calculation:", pc2_calculation)

PC1 Calculation: 7.408013160889753
PC2 Calculation: -30.892823442881628


In [13]:
from sklearn.cluster import KMeans
import plotly.express as px

# Perform k-means clustering on the PCA-transformed data
kmeans = KMeans(n_clusters=22, random_state=0)
pca_df['Cluster'] = kmeans.fit_predict(principal_components)

# Row for the hypothetical student, placed at the projection computed by hand above.
# Built from the computed values rather than retyped literals so the plotted point
# cannot drift from pc1_calculation / pc2_calculation, and kept under its own name
# so the `data` DataFrame loaded at the top of the notebook is not overwritten.
# No 'Cluster' key yet: the point has not been assigned to a cluster at this stage.
new_student = {
    'Principal Component 1': [pc1_calculation],
    'Principal Component 2': [pc2_calculation],
    'Math Score': [math_score],
    'Reading Score': [reading_score],
    'Writing Score': [writing_score],
}

pca_df_new = pd.concat([pca_df, pd.DataFrame(new_student)], ignore_index=True)

# Create a scatter plot for the k-means clustering results
fig4 = px.scatter(
    pca_df_new,
    x='Principal Component 1',
    y='Principal Component 2',
    color='Cluster',
    title='K-Means Clustering on PCA Components',
    labels={'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'},
    color_continuous_scale='rainbow',
    hover_data=['Math Score', 'Reading Score', 'Writing Score'],
)

# Show the k-means clustering plot
fig4.show()

In [14]:
def centroids_to_dict(centroids):
    """Label each centroid "Centroid 1", "Centroid 2", ... and return them as a dict.

    Parameters
    ----------
    centroids : iterable
        Sequence of centroid coordinates; each item must provide ``.tolist()``
        (e.g. the rows of a NumPy array).

    Returns
    -------
    dict
        Maps "Centroid N" (1-based, in input order) to that centroid's
        coordinates as a plain Python list.
    """
    return {
        f"Centroid {position}": coordinates.tolist()
        for position, coordinates in enumerate(centroids, start=1)
    }


# Snapshot the fitted k-means cluster centres as a {"Centroid N": [coordinates]} mapping and print it
centroid_dict = centroids_to_dict(kmeans.cluster_centers_)
print(centroid_dict)

{'Centroid 1': [-6.111869423478335, 0.5782285706191373], 'Centroid 2': [32.1043064113064, -4.9821296137813755], 'Centroid 3': [-42.78416893098516, -6.150108729601856], 'Centroid 4': [12.756769422326052, -0.0465799241151558], 'Centroid 5': [-28.65214737634982, -5.840273983082456], 'Centroid 6': [-31.44598276748619, 12.028329408863128], 'Centroid 7': [36.749663956087964, 8.17131363726072], 'Centroid 8': [48.79804809811358, -0.8156936544085672], 'Centroid 9': [-70.65323917931163, 6.054481835880037], 'Centroid 10': [-12.908756970134108, 9.231922544128457], 'Centroid 11': [17.103785897136405, -8.734003639351979], 'Centroid 12': [2.344206817682325, 7.232993274338539], 'Centroid 13': [-23.96464558941628, 4.975606919112984], 'Centroid 14': [5.056149546938331, -5.486785449476165], 'Centroid 15': [-3.0158897288885007, -9.038812859923393], 'Centroid 16': [15.730693145335964, 10.433783922924867], 'Centroid 17': [-50.65831755660182, 3.948210786948815], 'Centroid 18': [-17.048942864284495, -2.189679

In [15]:
import numpy as np

def find_nearest_centroid(x, y, centroids):
    """Report which centroid lies closest (Euclidean distance) to the point (x, y).

    Parameters
    ----------
    x, y : float
        Coordinates of the query point.
    centroids : dict
        Mapping whose values are 2-element coordinate lists; the position of
        each value in the mapping's iteration order is used as its index.

    Returns
    -------
    str
        "cluster <i>", where <i> is the 0-based position of the nearest
        centroid among ``centroids.values()``.
    """
    centroid_matrix = np.array(list(centroids.values()))
    query_point = np.array([x, y])

    # Row-wise distance from every centroid to the query point
    offsets = centroid_matrix - query_point
    nearest_position = np.argmin(np.linalg.norm(offsets, axis=1))

    return f"cluster {nearest_position}"

# Find the centroid closest to the hand-computed (PC1, PC2) projection and report it
nearest_centroid = find_nearest_centroid(pc1_calculation, pc2_calculation, centroid_dict)
print(f"The cluster for ({pc1_calculation}, {pc2_calculation}) is {nearest_centroid}")

The cluster for (7.408013160889753, -30.892823442881628) is cluster 10


In [16]:
from sklearn.cluster import KMeans
import plotly.express as px

# Perform k-means clustering on the PCA-transformed data
kmeans = KMeans(n_clusters=22, random_state=0)
pca_df['Cluster'] = kmeans.fit_predict(principal_components)

# Assign the hypothetical student to a cluster with the fitted model instead of a
# hand-typed label: predict() returns the index of the nearest cluster centre.
new_student_cluster = int(kmeans.predict([[pc1_calculation, pc2_calculation]])[0])

# Row for the hypothetical student, built from the computed projection and the
# scores defined earlier. Kept under its own name so the `data` DataFrame loaded
# at the top of the notebook is not overwritten.
new_student = {
    'Principal Component 1': [pc1_calculation],
    'Principal Component 2': [pc2_calculation],
    'Math Score': [math_score],
    'Reading Score': [reading_score],
    'Writing Score': [writing_score],
    'Cluster': [new_student_cluster],
}

pca_df_new = pd.concat([pca_df, pd.DataFrame(new_student)], ignore_index=True)

# Create a scatter plot for the k-means clustering results
fig4 = px.scatter(
    pca_df_new,
    x='Principal Component 1',
    y='Principal Component 2',
    color='Cluster',
    title='K-Means Clustering on PCA Components',
    labels={'Principal Component 1': 'PC1', 'Principal Component 2': 'PC2'},
    color_continuous_scale='rainbow',
    hover_data=['Math Score', 'Reading Score', 'Writing Score'],
)

# Show the k-means clustering plot
fig4.show()