In [None]:
# Imported libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from google.colab import files
from IPython.display import display

# Ask for the WA_Fn-UseC_-Telco-Customer-Churn.csv file through the Colab upload widget
files.upload()

# Load the uploaded CSV into a pandas DataFrame
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Remove the customerID identifier column
df = df.drop(columns='customerID')

# Parse TotalCharges as numbers (unparseable entries become NaN), then
# replace the missing entries with the column median.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

# Object-dtype columns are treated as categorical: fill their missing
# entries with the most frequent value (the mode) of the column.
object_columns = df.select_dtypes(include=['object']).columns
for column in object_columns:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

# Features (x) are every column except Churn; the target (y) is Churn encoded as 0/1
x = df.drop(columns='Churn')
y = df['Churn'].map({'No': 0, 'Yes': 1})

# Identifies categorical columns (object dtype) and numerical columns (int64/float64)
categorical_columns = x.select_dtypes(include=['object']).columns.tolist()
numerical_columns = x.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline: numerical columns are standardized, categorical
# columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see while fitting, so a cross-validation fold whose validation
# split contains a category missing from its training split is still scored
# instead of failing.
preprocessed = ColumnTransformer(transformers=[
    ('numerical', StandardScaler(), numerical_columns),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
])

# Function used to evaluate models with k-fold cross-validation (5 folds by default)
# (Accuracy, Precision, Recall, F1-Score, ROC-AUC)
def evaluate_model(model, x, y, cv=5):
    """Cross-validate ``model`` behind the shared preprocessing step.

    The classifier is wrapped in a Pipeline together with the module-level
    ``preprocessed`` transformer, so the preprocessing is re-fitted on the
    training part of every fold.

    Args:
        model: Unfitted scikit-learn classifier.
        x: Feature table passed to the pipeline.
        y: Binary target aligned with ``x``.
        cv: Cross-validation strategy forwarded to ``cross_validate``
            (defaults to 5 folds).

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall', 'F1-Score' and
        'ROC-AUC' to the mean score over the folds.
    """
    # Imported here so this function does not depend on the file's import block.
    from sklearn.model_selection import cross_validate

    pipeline = Pipeline(steps=[('pre', preprocessed), ('clf', model)])

    # Built-in scorer names. 'roc_auc' ranks the classifier's continuous
    # scores (probabilities / decision function); wrapping roc_auc_score in a
    # plain make_scorer would feed it hard 0/1 predictions and understate AUC.
    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1',
        'ROC-AUC': 'roc_auc',
    }

    # A single cross-validation pass computes every metric on the same fitted
    # folds instead of re-fitting the pipeline once per metric.
    cv_results = cross_validate(pipeline, x, y, cv=cv, scoring=scoring)
    return {metric: cv_results[f'test_{metric}'].mean() for metric in scoring}

# The two classification models being compared
log_regression_model = LogisticRegression(max_iter=1000)  # Logistic Regression
nb_model = GaussianNB()  # Gaussian Naive Bayes

# Evaluate each model in turn, keyed by the name shown in the results table
models = {
    "Logistic Regression": log_regression_model,
    "Naive Bayes": nb_model,
}
results = {label: evaluate_model(estimator, x, y) for label, estimator in models.items()}

# Per-column number formats: percentages for the first four metrics,
# five decimal places for ROC-AUC
percent_metrics = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
column_formats = {metric: '{:.2%}' for metric in percent_metrics}
column_formats['ROC-AUC'] = '{:.5f}'

# Thin white border around every table cell
cell_borders = [{'selector': 'td, th',
                 'props': [('border', '1px solid white')]}]

# Displays styled results in a data frame (one row per model, one column per metric)
data_frame = pd.DataFrame(results).T
data_frame_style = data_frame.style.set_properties(**{'text-align': 'left'})
data_frame_style = data_frame_style.format(column_formats)
data_frame_style = data_frame_style.set_table_styles(cell_borders)
display(data_frame_style)

# Summary of Problem 1 in Assignment 2 - Basic Machine Learning

# For this project, my goal was to implement two classification models, Logistic Regression and Naive Bayes, to predict
# the variable 'Churn' using the Telco Customer Churn dataset presented in the csv file. I dropped the customerID column in the file,
# changed the TotalCharges column to numeric values, and filled in missing values using the median for
# numerical (integer, float) columns and the mode for categorical (object) columns. Categorical columns were preprocessed
# using a OneHotEncoder, while the numerical columns were preprocessed using a StandardScaler. These models were evaluated
# with 5-fold cross-validation using the following metrics: Accuracy, Precision, Recall, F1-Score, and ROC-AUC. The results showed that
# accuracy and precision were higher for Logistic Regression, while recall was higher for Naive Bayes, and the F1-Score
# and ROC-AUC values were very similar for both models.

In [None]:
# Imported libraries
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from google.colab import files
from IPython.display import display

# Uploads the test_image.png input file
files.upload()

# Reads the image file using OpenCV.
# cv2.imread returns None instead of raising when the file is missing or
# cannot be decoded, so fail here with a clear message rather than inside
# cvtColor.
img = cv2.imread('test_image.png')
if img is None:
    raise FileNotFoundError("Could not read 'test_image.png'. Upload the image before running this cell.")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Input values for the reshape dimension of compressed image
while True:
    try:
        reshape_dimension = int(input("\nEnter the reshape dimension for this compressed image (Must be 1, 2, 3, 4): \n"))
    except ValueError:
        print("\nInvalid input. Please enter an integer that is 1, 2, 3, or 4.")
        continue

    if reshape_dimension not in (1, 2, 3, 4):  # Only 1, 2, 3 and 4 are accepted
        print("\nInvalid integer. Please enter 1, 2, 3, or 4.")
        continue

    # reshape(-1, reshape_dimension) only works when the dimension divides the
    # total number of values in the image, so reject other choices now instead
    # of crashing later during compression. 1 and 3 always divide an RGB image.
    if img_rgb.size % reshape_dimension != 0:
        print(f"\nThis image has {img_rgb.size} values, which cannot be split into groups of {reshape_dimension}. Please enter a different value.")
        continue

    break

# Shows the original image
plt.figure(figsize=(11, 6))
plt.imshow(img_rgb)
plt.title("Original Image")
plt.axis("off")
plt.show()

# Compresses the image using k-means clustering
def compress_image(img_array, k, reshape_dimension, random_state=None):
    """Quantize ``img_array`` to at most ``k`` distinct value groups with k-means.

    The image is flattened into rows of ``reshape_dimension`` values, the rows
    are clustered, and every row is replaced by the centre of its cluster.

    Args:
        img_array: Image array; the result is cast to uint8, so 8-bit
            channel values are assumed.
        k: Number of clusters (palette entries).
        reshape_dimension: Number of consecutive values clustered together as
            one sample. It must divide ``img_array.size``.
        random_state: Optional seed forwarded to KMeans to make the result
            reproducible. The default (None) keeps the unseeded behaviour.

    Returns:
        uint8 array with the same shape as ``img_array``.

    Raises:
        ValueError: From numpy, if ``img_array`` cannot be reshaped into rows
            of ``reshape_dimension`` values.
    """
    samples = img_array.reshape(-1, reshape_dimension).astype(np.float32)

    kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    kmeans.fit(samples)
    # The fitted estimator already holds the training assignments, so a
    # separate predict() pass over the same data is unnecessary.
    labels = kmeans.labels_

    # Round the centres to the nearest integer before the uint8 cast: a bare
    # astype() truncates (127.9 -> 127), which darkens every colour slightly
    # and inflates the reconstruction error. Only the k centres are rounded,
    # not every pixel. The clip guards the cast against float overshoot.
    palette = np.clip(np.rint(kmeans.cluster_centers_), 0, 255).astype(np.uint8)

    return palette[labels].reshape(img_array.shape)

# Mean Squared Error (MSE) between the original and compressed images
def mean_standard_error(original_img, compressed_img):
    """Return the mean squared error between two images.

    Despite its name, this computes the mean of the squared element-wise
    differences (MSE), not a standard error. Both inputs are converted to
    float64 first so the subtraction cannot wrap around in unsigned 8-bit
    arithmetic.

    Args:
        original_img: Reference image array.
        compressed_img: Image array compared against the reference.

    Returns:
        The MSE as a float scalar.
    """
    difference = original_img.astype(np.float64) - compressed_img.astype(np.float64)
    return np.mean(difference ** 2)

# Different values of k that were being used
errors = []
k_values = [2, 4, 8, 16, 32]

# K-Value loop that compresses the image, records the MSE, and shows and saves the result
for k in k_values:
    compressed_img = compress_image(img_rgb, k, reshape_dimension)
    error = mean_standard_error(img_rgb, compressed_img)
    errors.append(error)

    # Shows the compressed image. The figure is created here, next to the
    # drawing calls that use it, rather than as a side effect of the MSE
    # calculation.
    plt.figure(figsize=(11, 6))
    plt.imshow(compressed_img)
    plt.title(f'Compressed Image (K={k})')
    plt.axis('off')
    plt.show()

    # OpenCV writes BGR, so convert back from RGB before saving
    cv2.imwrite(f'compressed_k_{k}.png', cv2.cvtColor(compressed_img, cv2.COLOR_RGB2BGR))

# Line chart of the mean squared error (MSE) against the number of clusters (K)
x_ticks = np.arange(0, 34, 2)        # K axis: every 2 from 0 to 32
y_ticks = np.arange(0, 2100, 100)    # MSE axis: every 100 from 0 to 2000

plt.plot(k_values, errors, marker='o')
plt.title('Number of Clusters (K) vs. Mean Squared Error (MSE)')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Mean Squared Error (MSE)')
plt.xticks(x_ticks)
plt.yticks(y_ticks)
plt.grid(True)
plt.show()

# Column headings shared by the results dictionary and its number formats
k_column = 'Number of Clusters (K)'
mse_column = 'Mean Squared Error (MSE)'

# One row per tested K with the MSE measured for it
results = {k_column: k_values, mse_column: errors}

# K is shown as an integer, the MSE with ten decimal places
column_formats = {k_column: '{:d}', mse_column: '{:.10f}'}

# Thin white border around every table cell
cell_borders = [{'selector': 'td, th',
                 'props': [('border', '1px solid white')]}]

# Displays styled results in a data frame, with the index column hidden
data_frame = pd.DataFrame(results)
data_frame_style = data_frame.style.set_properties(**{'text-align': 'left'})
data_frame_style = data_frame_style.hide(axis='index')
data_frame_style = data_frame_style.format(column_formats)
data_frame_style = data_frame_style.set_table_styles(cell_borders)
display(data_frame_style)

# Summary of Problem 2 in Assignment 2 - Basic Machine Learning

# For this project, my goal was to implement a naive version of a k-means clustering algorithm to compress
# the test_image.png file by using a limited color palette. The image was read and written using OpenCV, converted from BGR to RGB format,
# and reshaped into rows of a user-specified reshape dimension (1, 2, 3, or 4) for k-means clustering. This process was tested and repeated
# using k-values of 2, 4, 8, 16, and 32. For each k-value, the Mean Squared Error (MSE) of the compressed image relative to the original was then calculated.
# As k increased, the MSE decreased, showing that the quality of each image gradually improved and became closer to the original image.
# The results also show that a reshape dimension of 1 produces higher-quality images, while a reshape dimension of 4 produces lower-quality images.