# Assignment 2 - PML

 Harsh Chandrakar
 
 (PRN-250840125020)


## Q1

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load Data: "Type" is the class label, every other column is used as a feature.
df_glass = pd.read_csv("Glass.csv")
X = df_glass.drop("Type", axis=1)
y = df_glass["Type"]

# Encode Target (Classes are 1, 2, 3, 5, 6, 7) into contiguous ids
# 0..num_classes-1 so they can index the softmax outputs directly.
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)

# Split Data BEFORE scaling so that test rows never influence the
# preprocessing statistics. Stratifying keeps every class represented in both
# partitions, which the multiclass log-loss evaluation below relies on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Scale Features: learn mean/std on the training rows only, then apply the
# same transform to the held-out rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Neural Network (TensorFlow/Keras)
# Two hidden ReLU layers followed by a softmax with one unit per encoded class.
model_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])
# Sparse loss: y_train holds integer class ids, not one-hot rows.
model_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train, y_train, epochs=50, verbose=0, batch_size=16)

# The softmax always emits num_classes probability columns.
y_pred_nn = model_nn.predict(X_test)
# Name the labels explicitly: without them log_loss infers the class set from
# y_test and raises if the test split happens to miss a class.
loss_nn = log_loss(y_test, y_pred_nn, labels=np.arange(num_classes))
print(f"Neural Network Log Loss: {loss_nn:.4f}")

# 2. Logistic Regression (Grid Search)
# Tune the inverse regularisation strength C and the solver with 5-fold CV,
# selecting on (negated) log loss.
# NOTE(review): 'liblinear' fits one-vs-rest for multiclass targets and, as far
# as I recall, recent scikit-learn releases deprecate that combination —
# confirm against the installed version before relying on it.
params_lr = {'C': [0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']}
grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), params_lr, scoring='neg_log_loss', cv=5)
grid_lr.fit(X_train, y_train)
y_pred_lr = grid_lr.predict_proba(X_test)
# predict_proba columns follow grid_lr.classes_; pass them so log_loss does not
# have to infer the class set from y_test (it raises if y_test misses a class).
loss_lr = log_loss(y_test, y_pred_lr, labels=grid_lr.classes_)
print(f"Logistic Regression Log Loss: {loss_lr:.4f} (Best Params: {grid_lr.best_params_})")

# 3. Gradient Boosting (Grid Search)
# Search learning rate, number of boosting stages and tree depth with 5-fold
# CV, selecting on (negated) log loss.
params_gb = {
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [20, 50],
    'max_depth': [2, 3, None]
}
grid_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), params_gb, scoring='neg_log_loss', cv=5)
grid_gb.fit(X_train, y_train)
y_pred_gb = grid_gb.predict_proba(X_test)
# predict_proba columns follow grid_gb.classes_; pass them so log_loss does not
# have to infer the class set from y_test (it raises if y_test misses a class).
loss_gb = log_loss(y_test, y_pred_gb, labels=grid_gb.classes_)
print(f"Gradient Boosting Log Loss: {loss_gb:.4f} (Best Params: {grid_gb.best_params_})")

ModuleNotFoundError: No module named 'tensorflow'

## Q2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the data and discard the two columns the exercise ignores (city, zip).
df_sac = pd.read_csv("Sacremento.csv")
df_sac = df_sac.drop(columns=['city', 'zip'])

# 'price' is the regression target; all remaining columns are predictors.
y = df_sac['price']
X = df_sac.drop(columns='price')

# 'type' is one-hot encoded; every other predictor is treated as numeric and
# standardised. Column order of X is preserved.
categorical_cols = ['type']
numerical_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Hold out 20% of the rows for the final MSE comparison.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. Ridge Regression
# Preprocess + Ridge in one pipeline so each CV fold refits the preprocessing;
# the penalty strength alpha is selected on 5-fold negated MSE.
pipeline_ridge = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', Ridge()),
])
params_ridge = {'model__alpha': [0, 0.1, 1, 1.5, 2]}
grid_ridge = GridSearchCV(
    estimator=pipeline_ridge,
    param_grid=params_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_ridge.fit(X_train, y_train)

# Score the refitted best pipeline on the held-out rows.
ridge_test_pred = grid_ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, ridge_test_pred)
print(f"Ridge MSE: {mse_ridge:.2f}")

# 2. Decision Tree
# Same preprocessing, followed by a seeded regression tree. Depth and the two
# minimum-sample limits are tuned on 5-fold negated MSE.
tree_model = DecisionTreeRegressor(random_state=42)
pipeline_dt = Pipeline(steps=[('pre', preprocessor), ('model', tree_model)])
params_dt = {
    'model__max_depth': [3, 5],
    'model__min_samples_split': [4, 10],
    'model__min_samples_leaf': [2, 5],
}
grid_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=params_dt,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_dt.fit(X_train, y_train)

# Held-out error of the best tree found by the search.
tree_test_pred = grid_dt.predict(X_test)
mse_dt = mean_squared_error(y_test, tree_test_pred)
print(f"Decision Tree MSE: {mse_dt:.2f}")

# 3. Random Forest
# Same preprocessing, followed by a seeded random forest. Only the number of
# features considered per split is tuned, again on 5-fold negated MSE.
forest_model = RandomForestRegressor(random_state=42)
pipeline_rf = Pipeline(steps=[('pre', preprocessor), ('model', forest_model)])
params_rf = {'model__max_features': [4, 6, 9]}
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_rf.fit(X_train, y_train)

# Held-out error of the best forest found by the search.
forest_test_pred = grid_rf.predict(X_test)
mse_rf = mean_squared_error(y_test, forest_test_pred)
print(f"Random Forest MSE: {mse_rf:.2f}")

## Q3

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Read the data, name the first column 'State' and use it as the row index so
# that only the remaining columns are passed to the scaler.
df_arrests = pd.read_csv("USArrests.csv")
first_col = df_arrests.columns[0]
df_arrests = df_arrests.rename(columns={first_col: 'State'}).set_index('State')

# Standardise every column (zero mean, unit variance) before clustering / PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_arrests)

# 1. K-Means Clustering & Elbow Plot
# Fit one seeded K-Means model per K in 3..10 and record its within-cluster
# sum of squares (inertia_).
K_range = range(3, 11)
inertia_by_k = {}
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertia_by_k[k] = kmeans.inertia_
wss = list(inertia_by_k.values())

# WSS against K: the "elbow" is where adding clusters stops paying off.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, wss, marker='o')
ax.set_title('Elbow Method for USArrests')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('WSS')
plt.show()

# 2. PCA
# This cell's import list does not include numpy, so import it here to keep
# the cell runnable on its own (e.g. in a fresh kernel).
import numpy as np

# Fit PCA with all components and accumulate the explained-variance ratios.
pca = PCA()
pca.fit(X_scaled)
cum_var = np.cumsum(pca.explained_variance_ratio_)
print("Cumulative Variance Ratio:", cum_var)
# argmax over the boolean mask gives the index of the first component at which
# the cumulative ratio reaches 0.70; +1 turns that index into a count.
n_components = int(np.argmax(cum_var >= 0.70)) + 1
print(f"Number of components capturing >= 70% variation: {n_components}")

## Q4

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import Holt, ExponentialSmoothing
from sklearn.metrics import mean_squared_error

# Read the series, parse 'Date' (day-month-year) and index the frame by it in
# chronological order.
df_bb = pd.read_csv("BUNDESBANK-BBK01_WT5511.csv")
df_bb['Date'] = pd.to_datetime(df_bb['Date'], format='%d-%m-%Y')
df_bb = df_bb.set_index('Date').sort_index()

# Hold out the final 10 observations for testing; everything before is train.
train = df_bb.head(-10)
test = df_bb.tail(10)

# Both models forecast the same 10 steps that were held out as the test set.
horizon = 10
train_values = train['Value']
test_values = test['Value']

# 1. Holt's Linear Trend
model_holt = Holt(train_values).fit()
pred_holt = model_holt.forecast(horizon)
mse_holt = mean_squared_error(test_values, pred_holt)
print(f"Holt's Linear Trend MSE: {mse_holt:.2f}")

# 2. Holt-Winter's Method (Additive Seasonality)
# seasonal_periods=12 presumes a 12-observation cycle (e.g. monthly data) —
# TODO confirm against the actual sampling frequency of the series.
model_hw = ExponentialSmoothing(
    train_values,
    trend='add',
    seasonal='add',
    seasonal_periods=12,
).fit()
pred_hw = model_hw.forecast(horizon)
mse_hw = mean_squared_error(test_values, pred_hw)
print(f"Holt-Winter's Additive MSE: {mse_hw:.2f}")

# Plot the last 50 training points, the held-out values and both forecasts on
# one axis; forecasts are drawn against the test dates.
recent_train = train.tail(50)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(recent_train.index, recent_train['Value'], label='Train')
ax.plot(test.index, test['Value'], label='Test')
ax.plot(test.index, pred_holt, label='Holt Forecast')
ax.plot(test.index, pred_hw, label='HW Additive Forecast')
ax.legend()
ax.set_title('Bundesbank Forecast Comparison')
plt.show()