### **DS1**

Cell 1: Installing SMOTE incase the dataset is imbalance.

In [18]:
# Install imbalanced-learn for SMOTE
!pip install -q imbalanced-learn


Cell 2 : Importing libraries

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.over_sampling import SMOTE


Cell 3 : Load and Preprocess Dataset

In [20]:
# Load your uploaded file
df = pd.read_csv("salesmonthly.csv")

# Convert 'datum' to datetime and set as index
df['datum'] = pd.to_datetime(df['datum'], dayfirst=True)
df.set_index('datum', inplace=True)

# Basic Info
print(df.info())
df.head()


  df['datum'] = pd.to_datetime(df['datum'], dayfirst=True)


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 70 entries, 2014-01-31 to 2019-10-31
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   M01AB   70 non-null     float64
 1   M01AE   70 non-null     float64
 2   N02BA   70 non-null     float64
 3   N02BE   70 non-null     float64
 4   N05B    70 non-null     float64
 5   N05C    70 non-null     int64  
 6   R03     70 non-null     float64
 7   R06     70 non-null     float64
dtypes: float64(7), int64(1)
memory usage: 4.9 KB
None


Unnamed: 0_level_0,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06
datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-01-31,127.69,99.09,152.1,878.03,354.0,50,112.0,48.2
2014-02-28,133.32,126.05,177.0,1001.9,347.0,31,122.0,36.2
2014-03-31,137.44,92.95,147.655,779.275,232.0,20,112.0,85.4
2014-04-30,113.1,89.475,130.9,698.5,209.0,18,97.0,73.7
2014-05-31,101.79,119.933,132.1,628.78,270.0,23,107.0,123.7


Cell 4 : Normalize and Apply Clustering (Unsupervised Learning)

In [21]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans_labels = kmeans.fit_predict(X_scaled)
sil_kmeans = silhouette_score(X_scaled, kmeans_labels)

# DBSCAN clustering
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
sil_dbscan = silhouette_score(X_scaled, dbscan_labels)

# Print Silhouette Scores
print("Silhouette Score (KMeans):", sil_kmeans)
print("Silhouette Score (DBSCAN):", sil_dbscan)


Silhouette Score (KMeans): 0.17742955794674686
Silhouette Score (DBSCAN): -0.04822971423364046


Cell 5 : Feature Selection Setup

In [23]:
# Simulate classification by binning 'M01AB'
df['label'] = pd.cut(df['M01AB'], bins=3, labels=[0, 1, 2])

print("Label distribution:")
print(df['label'].value_counts())

X = df.drop('label', axis=1)
y = df['label']



# Select top 5 features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)


Label distribution:
label
2    47
1    21
0     2
Name: count, dtype: int64


Cell 6 : Imbalance Learning (SMOTE)

In [24]:
# Check class distribution before SMOTE
print("Original Class Distribution:", dict(zip(*np.unique(y_train, return_counts=True))))

# Find the smallest class count
min_class_count = min(np.unique(y_train, return_counts=True)[1])
k_neighbors = min(5, min_class_count - 1)

# Apply SMOTE safely
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("Balanced Class Distribution:", dict(zip(*np.unique(y_train_bal, return_counts=True))))


Original Class Distribution: {np.int64(0): np.int64(119), np.int64(1): np.int64(117)}
Balanced Class Distribution: {np.int64(0): np.int64(119), np.int64(1): np.int64(119)}


Cell 7 : Train and Evaluate 5 Supervised Models

In [25]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier()
}

# Train and Evaluate
for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)
    print(f"\n🔹 {name}")
    print("Accuracy:", model.score(X_test, y_test))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔹 Logistic Regression
Accuracy: 0.7166666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.48      0.62        29
           1       0.66      0.94      0.77        31

    accuracy                           0.72        60
   macro avg       0.77      0.71      0.70        60
weighted avg       0.76      0.72      0.70        60

Confusion Matrix:
 [[14 15]
 [ 2 29]]

🔹 Decision Tree
Accuracy: 0.6833333333333333
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.59      0.64        29
           1       0.67      0.77      0.72        31

    accuracy                           0.68        60
   macro avg       0.69      0.68      0.68        60
weighted avg       0.69      0.68      0.68        60

Confusion Matrix:
 [[17 12]
 [ 7 24]]

🔹 Random Forest
Accuracy: 0.6833333333333333
Classification Report:
               precision    recall  f1-score   support


# **DS2**

Cell 1: Load and Explore Dataset

In [4]:
import pandas as pd

# Load the dataset
ds2 = pd.read_csv('StudentsPerformance.csv')

# Display first few rows
ds2.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Cell 2: Check for Nulls & Basic Info

In [5]:
# Check dataset info
ds2.info()

# Check for missing values
ds2.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


Cell 3: Encoding Categorical Data

In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
le = LabelEncoder()
for col in ds2.columns:
    if ds2[col].dtype == 'object':
        ds2[col] = le.fit_transform(ds2[col])

ds2.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


Cell 4: Feature Extraction Techniques (PCA and LDA)

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = ds2.drop('math score', axis=1)
y = ds2['math score']

# Standardizing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)


LDA

In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Convert 'math score' to a class label for LDA
y_class = pd.qcut(y, q=3, labels=[0,1,2])

lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X_scaled, y_class)


Cell 5: Classification Using PCA Features

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X_pca, y_class, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.78      0.81        74
           1       0.54      0.58      0.56        65
           2       0.70      0.70      0.70        61

    accuracy                           0.69       200
   macro avg       0.70      0.69      0.69       200
weighted avg       0.70      0.69      0.70       200

[[58 14  2]
 [11 38 16]
 [ 0 18 43]]


 Cell 6: Classification Using LDA Features

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_lda, y_class, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.75      0.76      0.75        74
           1       0.61      0.57      0.59        65
           2       0.83      0.87      0.85        61

    accuracy                           0.73       200
   macro avg       0.73      0.73      0.73       200
weighted avg       0.73      0.73      0.73       200

[[56 18  0]
 [17 37 11]
 [ 2  6 53]]


# **DS3**

Cell 1: Load and Explore Dataset

In [11]:
ds3 = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
ds3.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Cell 2: Clean Data

In [12]:
ds3 = ds3.dropna()
ds3 = ds3.drop(['Loan_ID'], axis=1)

for col in ds3.columns:
    if ds3[col].dtype == 'object':
        ds3[col] = le.fit_transform(ds3[col])

ds3.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,1


Cell 3: Check Balance

In [13]:
ds3['Loan_Status'].value_counts()


Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,332
0,148


Cell 4: Before Balancing - Model & Evaluation

In [14]:
X = ds3.drop('Loan_Status', axis=1)
y = ds3['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Before Balancing")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Before Balancing
              precision    recall  f1-score   support

           0       0.87      0.46      0.60        28
           1       0.81      0.97      0.89        68

    accuracy                           0.82        96
   macro avg       0.84      0.72      0.75        96
weighted avg       0.83      0.82      0.80        96

[[13 15]
 [ 2 66]]


Cell 5: Apply SMOTE and Classify

In [15]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("After SMOTE Balancing")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


After SMOTE Balancing
              precision    recall  f1-score   support

           0       0.87      0.82      0.84        65
           1       0.83      0.88      0.86        68

    accuracy                           0.85       133
   macro avg       0.85      0.85      0.85       133
weighted avg       0.85      0.85      0.85       133

[[53 12]
 [ 8 60]]


Cell 6: Apply RandomUnderSampler

In [16]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("After RandomUnderSampler")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


After RandomUnderSampler
              precision    recall  f1-score   support

           0       0.78      0.62      0.69        29
           1       0.70      0.84      0.76        31

    accuracy                           0.73        60
   macro avg       0.74      0.73      0.73        60
weighted avg       0.74      0.73      0.73        60

[[18 11]
 [ 5 26]]
