In [None]:
# UBER FARE PREDICTION



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Load the raw trips and take a first look ---
df = pd.read_csv("uber.csv")
print(df.head())


df.info()

# Check for missing values.
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the per-column counts have to be printed to actually show up.
print(df.isna().sum())

# Drop rows with missing values (reassign instead of mutating in place so the
# step gives the same result when the cell is re-run)
df = df.dropna()
print("After dropping NaNs:", df.shape)


# Remove invalid fare amounts (negative or zero)
df = df[df["fare_amount"] > 0]

# Remove invalid passenger counts (keep 1 to 6 passengers inclusive)
df = df[(df["passenger_count"] > 0) & (df["passenger_count"] <= 6)]


# Boxplot before removing outliers
plt.figure(figsize=(6, 3))
sns.boxplot(x=df["fare_amount"], color="lightcoral")

plt.title("Fare Amount Distribution (Before Outlier Removal)")
plt.show()


# --- Identify and remove outliers using IQR ---
Q1 = df["fare_amount"].quantile(0.25)
Q3 = df["fare_amount"].quantile(0.75)
IQR = Q3 - Q1

# Keep only fares inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
df = df[
    (df["fare_amount"] >= Q1 - 1.5 * IQR)
    & (df["fare_amount"] <= Q3 + 1.5 * IQR)
]

# Boxplot after removing outliers
plt.figure(figsize=(6, 3))
sns.boxplot(x=df["fare_amount"], color="skyblue")
plt.title("Fare Amount Distribution (After Outlier Removal)")
plt.show()


def haversine(lon1, lat1, lon2, lat2):
    """Calculate great-circle distance (km) between two points.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in degrees.

    Returns
    -------
    float or array-like
        Haversine distance in kilometres on a sphere of radius 6371 km.
        Array-like inputs are processed element-wise.
    """
    R = 6371  # Earth radius (km)
    lon1, lon2, lat1, lat2 = map(np.radians, [lon1, lon2, lat1, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make one of the square roots NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Compute distance and add as new column.
# `df` is the result of boolean filtering at this point, so writing a column
# into it directly triggers pandas' SettingWithCopyWarning; `assign` returns a
# fresh frame with the extra column instead.
df = df.assign(
    distance_km=haversine(
        df["pickup_longitude"],
        df["pickup_latitude"],
        df["dropoff_longitude"],
        df["dropoff_latitude"],
    )
)

# Remove entries with zero or very small distance (0.1 km or less)
df = df[df["distance_km"] > 0.1]

# Printed explicitly: a bare `df.head()` in the middle of a cell shows nothing
print(df.head())


# Pairwise correlation between the target and the two model features
corr_matrix = df[["fare_amount", "distance_km", "passenger_count"]].corr()

plt.figure(figsize=(5, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


# Predictors (trip distance, passenger count) and the regression target (fare)
X = df.loc[:, ["distance_km", "passenger_count"]]
y = df["fare_amount"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# ---- Linear Regression ----
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


# ---- Random Forest Regression ----
# 100 trees, seeded so repeated runs build the same forest
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


def evaluate_model(y_true, y_pred, model_name):
    """Return ``(rmse, r2)`` for a set of regression predictions.

    RMSE is the square root of ``mean_squared_error(y_true, y_pred)`` and R^2
    comes from ``r2_score(y_true, y_pred)``. ``model_name`` is accepted but is
    not used in the computation.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), r2_score(y_true, y_pred)

# Score both models on the held-out test set
rmse_lr, r2_lr = evaluate_model(y_test, y_pred_lr, "Linear Regression")
rmse_rf, r2_rf = evaluate_model(y_test, y_pred_rf, "Random Forest")

# Side-by-side table of the two models' test metrics
comparison = pd.DataFrame(
    {
        "Model": ["Linear Regression", "Random Forest"],
        "RMSE": [rmse_lr, rmse_rf],
        # "\u00b2" is the superscript-two character; written as an escape so the
        # column label cannot be mangled by a wrong file encoding again.
        "R\u00b2 Score": [r2_lr, r2_rf],
    }
)
print("\nModel Comparison:")
print(comparison)


In [None]:
# EMAIL SPAM CLASSIFICATION


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score


df = pd.read_csv("emails.csv")
# Printed explicitly: a bare `df.head()` in the middle of a cell shows nothing
print(df.head())
df.info()
print("Initial shape:", df.shape)


# Drop rows with missing values (reassign instead of mutating in place)
df = df.dropna()
print("After dropping missing values:", df.shape)

# Every column except the first and the last is used as a feature
# (presumably word counts, with the first column an identifier - confirm).
X = df.iloc[:, 1:-1]
# The last column is the label. The metric calls in this cell pass
# pos_label=1, so labels are expected to be numeric with 1 as the positive
# class (presumably spam - confirm), not the strings 'spam' / 'not spam'.
y = df.iloc[:, -1]

# Check class distribution
sns.countplot(x=y)
plt.title("Class Distribution: Not Spam vs Spam")
plt.show()


# Split data (stratify to preserve class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Standardize features: fit the scaler on the training split only, then apply
# the same transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _binary_metrics(y_true, y_pred):
    """Return accuracy, precision, recall, F1 and the confusion matrix as a dict.

    Precision, recall and F1 are computed with label 1 as the positive class.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, pos_label=1),
        "recall": recall_score(y_true, y_pred, pos_label=1),
        "f1_score": f1_score(y_true, y_pred, pos_label=1),
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }


def _print_metrics(heading, scores):
    """Print `heading` followed by the metrics in a dict from `_binary_metrics`."""
    print(heading)
    print(f"Accuracy: {scores['accuracy']:.3f}")
    print(f"Precision: {scores['precision']:.3f}")
    print(f"Recall: {scores['recall']:.3f}")
    print(f"F1-score: {scores['f1_score']:.3f}")
    print(f"Confusion Matrix:\n{scores['confusion_matrix']}")


# KNN for multiple k values, store results
k_values = [3, 5, 7]
knn_results = {}  # maps k -> metrics dict

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred_knn = knn.predict(X_test_scaled)

    knn_results[k] = _binary_metrics(y_test, y_pred_knn)
    _print_metrics(f"\nKNN with k={k}", knn_results[k])

# Initialize SVM with default parameters
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

# Same metrics and report layout as for KNN, so the two are directly comparable
svm_metrics = _binary_metrics(y_test, y_pred_svm)
_print_metrics("\nSVM (Default Settings)", svm_metrics)



# Compare metrics for the best KNN setting and SVM.
# The best k is taken from the stored results (highest F1-score; the earliest
# stored k wins a tie) instead of being hard-coded, so the chart stays correct
# if the data or the candidate k values change.
best_k = max(knn_results, key=lambda k: knn_results[k]["f1_score"])
metrics = ["Accuracy", "Precision", "Recall", "F1-score"]
# Result-dict keys, in the same order as the display labels in `metrics`
metric_keys = ["accuracy", "precision", "recall", "f1_score"]

knn_scores = [knn_results[best_k][key] for key in metric_keys]
svm_scores = [svm_metrics[key] for key in metric_keys]

# Plot grouped bar chart: one pair of bars (KNN, SVM) per metric
x = np.arange(len(metrics))
width = 0.35

plt.figure(figsize=(8,5))
plt.bar(x - width/2, knn_scores, width, label=f'KNN (k={best_k})', color='blue')
plt.bar(x + width/2, svm_scores, width, label='SVM', color='green')

plt.xticks(x, metrics)
plt.ylim(0,1.05)
plt.ylabel("Score")
plt.title("KNN vs SVM: Classification Metrics")
plt.legend()
plt.show()



In [None]:
# CUSTOMER BANK CHURN PREDICTION

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


df = pd.read_csv("Churn_Modelling.csv")
# Printed explicitly: a bare `df.head()` in the middle of a cell shows nothing
print(df.head())

print("Initial shape:", df.shape)

df.info()

# Drop irrelevant columns
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Encode categorical variables as integer codes
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

# NOTE(review): LabelEncoder turns Geography into integer codes, which the
# network will treat as ordered magnitudes. If Geography has more than two
# distinct values, one-hot encoding may be a better fit - confirm.
le_geo = LabelEncoder()
df['Geography'] = le_geo.fit_transform(df['Geography'])

# Show the frame after dropping and encoding columns
print(df.head())


# Split features and target
X = df.drop('Exited', axis=1)
y = df['Exited']

# Train-test split (80-20), stratified so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



# Normalize features: fit on the training split only, reuse for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def build_and_train_nn(X_train, y_train, X_test, y_test,
                       hidden_layers=(64, 32), activation='relu', epochs=50):
    """Build, train and evaluate a fully connected binary classifier.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels. ``X_train.shape[1]`` sets the
        network's input width; 20% of the training data is held out by Keras
        for validation during fitting.
    X_test, y_test
        Held-out features and labels used for the printed accuracy and
        confusion matrix.
    hidden_layers : sequence of int
        Units of each hidden layer, in order. Must contain at least one entry.
    activation : str
        Activation used by every hidden layer.
    epochs : int
        Number of training epochs.

    Returns
    -------
    tuple
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises
    ------
    ValueError
        If ``hidden_layers`` is empty.
    """
    # Immutable default plus a local copy: avoids the shared-mutable-default
    # pitfall and accepts any sequence (list, tuple, ...).
    hidden_layers = list(hidden_layers)
    if not hidden_layers:
        raise ValueError("hidden_layers must contain at least one layer size")

    # Initialize model; the first hidden layer also declares the input width
    model = Sequential()
    model.add(Dense(hidden_layers[0], input_dim=X_train.shape[1], activation=activation))

    # Remaining hidden layers
    for units in hidden_layers[1:]:
        model.add(Dense(units, activation=activation))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    # Compile
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train silently (verbose=0)
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=epochs, batch_size=32, verbose=0)

    # Predictions: threshold the sigmoid output at 0.5. The single-unit output
    # layer yields one column per sample, so flatten it to 1-D class labels
    # before comparing with y_test.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Activation: {activation}, Epochs: {epochs}")
    print(f"Accuracy: {acc:.3f}")
    print("Confusion Matrix:\n", cm)

    return model, history




# Settings to try: every activation is paired with every epoch budget
activations = ['relu', 'tanh', 'sigmoid']
epochs_list = [25, 50, 75]

# One (activation, epochs, trained model, training history) tuple per run
results = []

for act, ep in [(a, e) for a in activations for e in epochs_list]:
    model, history = build_and_train_nn(
        X_train_scaled, y_train, X_test_scaled, y_test,
        activation=act, epochs=ep,
    )
    results += [(act, ep, model, history)]




import matplotlib.pyplot as plt

# Distinct activations / epoch budgets present in the results, alphabetically
# and numerically sorted; one subplot is drawn per activation
activations = sorted({run[0] for run in results})
epochs_list = sorted({run[1] for run in results})

plt.figure(figsize=(12, 6))

for i, act in enumerate(activations):
    plt.subplot(1, len(activations), i + 1)

    # Overlay the validation-accuracy curve of every run using this activation
    for act_r, ep, model, history in results:
        if act_r != act:
            continue
        plt.plot(history.history['val_accuracy'], label=f'{ep} epochs')

    plt.title(f'Validation Accuracy - {act}')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.legend()

plt.tight_layout()
plt.show()



In [None]:
# K NEAREST NEIGHBORS


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


df = pd.read_csv("diabetes.csv")
# Printed explicitly: bare expressions in the middle of a cell show nothing
print(df.head())

print(df.shape)
df.info()

# Check for missing values
print(df.isnull().sum())


# Columns where zero is invalid and should be replaced
zero_invalid_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Replace zeros with the median of the non-zero readings in each column.
# Taking the median while the placeholder zeros are still in the column would
# pull it toward zero (all the way to zero if they are the majority).
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split, so the test rows influence the imputed values.
for col in zero_invalid_cols:
    nonzero_values = df.loc[df[col] != 0, col]
    if nonzero_values.empty:
        # No valid reading to impute from; leave the column untouched
        continue
    median_val = nonzero_values.median()
    df[col] = df[col].replace(0, median_val)

# Split features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Stratified 80/20 train-test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Standardise features: statistics are learned from the training split only
# and then applied to both splits
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Search space: neighbour counts crossed with distance metrics
k_values = [3, 5, 7]
distance_metrics = ['euclidean', 'manhattan', 'minkowski', 'chebyshev']
minkowski_p = 3  # exponent passed only when the metric is Minkowski

# (metric, k) -> metrics dict
knn_results = {}

for metric in distance_metrics:
    for k in k_values:
        # Only the Minkowski metric receives the explicit exponent `p`
        extra = {"p": minkowski_p} if metric == 'minkowski' else {}
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric, **extra)

        knn.fit(X_train_scaled, y_train)
        y_pred = knn.predict(X_test_scaled)

        # Accuracy is computed once and reused for the error rate
        acc = accuracy_score(y_test, y_pred)
        scores = {
            "accuracy": acc,
            "error_rate": 1 - acc,
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "confusion_matrix": confusion_matrix(y_test, y_pred),
        }
        knn_results[(metric, k)] = scores

        # Report this configuration
        print(f"\nKNN with k={k}, metric={metric}")
        print(f"Accuracy: {scores['accuracy']:.3f}")
        print(f"Error Rate: {scores['error_rate']:.3f}")
        print(f"Precision: {scores['precision']:.3f}")
        print(f"Recall: {scores['recall']:.3f}")
        print(f"F1-score: {scores['f1_score']:.3f}")
        print("Confusion Matrix:\n", scores['confusion_matrix'])





# Tabulate the results; after transposing, each row is one (metric, k) pair
results_df = pd.DataFrame(knn_results).T

# Grouped bars: one group per configuration, one bar per score
results_df.loc[:, ['accuracy', 'precision', 'recall', 'f1_score']].plot.bar(figsize=(14,6))
plt.title("KNN Performance Metrics for Different k and Distance Metrics")
plt.xlabel("(Distance Metric, k)")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.legend(loc='lower right')
plt.show()


In [None]:
# Data Clustering K MEANS AND HIERARCHICAL




import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.decomposition import PCA

df = pd.read_csv("sales_data_sample.csv", encoding='latin1')
# Printed explicitly: bare expressions in the middle of a cell show nothing
print(df.head())
print(df.shape)
df.info()


#Check for missing values
print("\nMissing values:\n", df.isnull().sum())

# Numeric features selected for clustering
numeric_cols = ['QUANTITYORDERED', 'PRICEEACH', 'SALES']

# Drop rows with missing values in these columns
X = df[numeric_cols].dropna()


# Scale features.
# X_scaled is built from X alone so the two stay row-aligned; this is required
# when cluster labels computed from X_scaled are written back into X. No
# median-fill step is needed because X has no missing values after dropna().
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Shape of scaled data:", X_scaled.shape)



# Correlation heatmap of the scaled features, relabelled with the column names
plt.figure(figsize=(6,5))
scaled_corr = pd.DataFrame(X_scaled, columns=numeric_cols).corr()
sns.heatmap(scaled_corr, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numeric Features")
plt.show()


# Scatter plot example: SALES (y-axis) against PRICEEACH (x-axis), unscaled
plt.figure(figsize=(6,4))
sns.scatterplot(data=X, x='PRICEEACH', y='SALES')
plt.title("SALES vs PRICEEACH")
plt.show()


# Elbow Method: record the K-Means inertia for k = 1..10
inertia = []
k_range = range(1, 11)

for k in k_range:
    # Seeded, with 10 initialisations per k
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia += [kmeans.inertia_]

# Plot inertia against k; the bend in the curve suggests the number of clusters
plt.figure(figsize=(6,4))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal k")
plt.show()

def _fit_and_plot_kmeans(n_clusters, features_scaled, coords_2d):
    """Fit K-Means with `n_clusters` and scatter-plot the labels on `coords_2d`.

    Returns the fitted model and the per-row cluster labels.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(features_scaled)

    plt.figure(figsize=(8,5))
    plt.scatter(coords_2d[:,0], coords_2d[:,1], c=labels, cmap='viridis', marker='o')
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title(f"K-Means Clusters Visualization (k={n_clusters})")
    plt.show()
    return model, labels


# Reduce to 2D using PCA for visualization. The projection does not depend on
# the number of clusters, so it is computed once and shared by both plots.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualise both candidate values of k read off the elbow plot. The loop ends
# on k=3, so k_optimal, kmeans and X['cluster'] hold the k=3 solution afterwards.
for k_optimal in (4, 3):
    kmeans, cluster_labels = _fit_and_plot_kmeans(k_optimal, X_scaled, X_pca)
    X['cluster'] = cluster_labels




# Ward-linkage hierarchy over the scaled features
Z = linkage(X_scaled, method='ward')

# Dendrogram truncated with truncate_mode='level', p=5
plt.figure(figsize=(10,6))
dendrogram(Z, truncate_mode='level', p=5)
plt.title("Hierarchical Clustering Dendrogram (truncated)")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.show()

# Cut the tree into at most 4 flat clusters and store the labels next to the
# K-Means labels
clusters_h = fcluster(Z, t=4, criterion='maxclust')
X['cluster_hier'] = clusters_h


# Copy of X whose numeric columns are re-read from the original frame for the
# same rows
X_summary = X.copy()
X_summary[numeric_cols] = df.loc[X.index, numeric_cols]

# Mean of each numeric feature per K-Means cluster
cluster_stats = X_summary.groupby('cluster')[numeric_cols].mean()
display(cluster_stats)