<a href="https://colab.research.google.com/github/tenacioussami/Data-science/blob/main/5_Data_Preprocessing_and_Pipeline_Skill_Morph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive


# **Loading and Exploring the Dataset**

In [4]:
df = pd.read_csv('/content/drive/MyDrive/datasets/diabetes.csv')  # No header in raw CSV

In [None]:
# Basic exploration
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
print(df.info())

print("\nBasic Statistics:")
print(df.describe())

print("\nCheck for missing values:")
print(df.isnull().sum())

print("\nTarget variable distribution:")
print(df['Outcome'].value_counts())

Dataset Shape: (768, 9)

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 

# **Train-Test Split**

In [None]:
# Separate input (X) and output (y)
X = df.drop('Outcome', axis=1)  # Everything except Outcome
y = df['Outcome']                # Only Outcome

print("X has all patient information (8 columns)")
print("y has diabetes yes/no (1 column)")

# Split: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for test
    random_state=20   # Same result every time
)

print(f"\nWe will train with: {len(X_train)} patients")
print(f"We will test with: {len(X_test)} patients")

X has all patient information (8 columns)
y has diabetes yes/no (1 column)

We will train with: 614 patients
We will test with: 154 patients


# **Why Scale?**

In [None]:
# Problem: Some numbers are big, some are small
print("Before scaling - different ranges:")
print(f"Age: smallest={X_train['Age'].min()}, biggest={X_train['Age'].max()}")
print(f"Insulin: smallest={X_train['Insulin'].min()}, biggest={X_train['Insulin'].max()}")
print("\nAge goes from 21 to 81 (small range)")
print("Insulin goes from 0 to 846 (big range!)")
print("This confuses the model!")

Before scaling - different ranges:
Age: smallest=21, biggest=81
Insulin: smallest=0, biggest=744

Age goes from 21 to 81 (small range)
Insulin goes from 0 to 846 (big range!)
This confuses the model!




```
# This is formatted as code
```

# **StandardScaler**

In [None]:
# Create scaler
scaler = StandardScaler()

# Learn from training data and scale it
X_train_scaled = scaler.fit_transform(X_train)

# Scale test data (just transform, don't fit)
X_test_scaled = scaler.transform(X_test)

print("\nAfter scaling:")
print("All features now have similar range!")
print(f"Example - First patient's age before scaling: {X_train.values[0][7]}")
print(f"Example - First patient's age after scaling: {X_train_scaled[0][7]:.2f}")


After scaling:
All features now have similar range!
Example - First patient's age before scaling: 28.0
Example - First patient's age after scaling: -0.43


# **MinMaxScaler (Min-Max normalization)**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create MinMaxScaler
minmax_scaler = MinMaxScaler()

# Learn from training data and scale it
X_train_minmax = minmax_scaler.fit_transform(X_train)

# Scale test data
X_test_minmax = minmax_scaler.transform(X_test)

print("\nAfter MinMaxScaler:")
print("All features now between 0 and 1!")
print(f"Example - First patient's age before scaling: {X_train.values[0][7]}")
print(f"Example - First patient's age after scaling: {X_train_minmax[0][7]:.2f}")

# Let's verify MinMaxScaler worked
print("\nChecking MinMaxScaler:")
print(f"  Minimum value: {X_train_minmax.min():.1f} (should be 0)")
print(f"  Maximum value: {X_train_minmax.max():.1f} (should be 1)")


After MinMaxScaler:
All features now between 0 and 1!
Example - First patient's age before scaling: 28.0
Example - First patient's age after scaling: 0.12

Checking MinMaxScaler:
  Minimum value: 0.0 (should be 0)
  Maximum value: 1.0 (should be 1)


# **Encoding Categorical Data**

In [None]:
# Let's create a sample dataset with text data
sample_data = pd.DataFrame({
    'age': [25, 30, 35, 40],
    'gender': ['male', 'female', 'male', 'female'],
    'smoker': ['yes', 'no', 'yes', 'no'],
    'disease': [1, 0, 1, 0]
})

print("Original data with text:")
print(sample_data)

Original data with text:
   age  gender smoker  disease
0   25    male    yes        1
1   30  female     no        0
2   35    male    yes        1
3   40  female     no        0


# **Label Encoding (For Binary/Two Categories)**

In [None]:
from sklearn.preprocessing import LabelEncoder

# For yes/no or male/female (2 categories only)
label_encoder = LabelEncoder()

# Convert 'smoker' column
sample_data['smoker_encoded'] = label_encoder.fit_transform(sample_data['smoker'])
print("\nAfter Label Encoding 'smoker':")
print(sample_data[['smoker', 'smoker_encoded']])
# no = 0, yes = 1

# Convert 'gender' column
sample_data['gender_encoded'] = label_encoder.fit_transform(sample_data['gender'])
print("\nAfter Label Encoding 'gender':")
print(sample_data[['gender', 'gender_encoded']])
# female = 0, male = 1


After Label Encoding 'smoker':
  smoker  smoker_encoded
0    yes               1
1     no               0
2    yes               1
3     no               0

After Label Encoding 'gender':
   gender  gender_encoded
0    male               1
1  female               0
2    male               1
3  female               0


# **One-Hot Encoding (For Multiple Categories)**

In [None]:
# Let's add a column with multiple categories
sample_data['city'] = ['Delhi', 'Mumbai', 'Delhi', 'Kolkata']

# One-Hot Encoding using pandas
city_encoded = pd.get_dummies(sample_data['city'], prefix='city')
print("\nOne-Hot Encoding for 'city':")
print(city_encoded)
# Creates separate columns for each city

# Add to original data
sample_data = pd.concat([sample_data, city_encoded], axis=1)
print("\nFinal data with all encodings:")
print(sample_data)


One-Hot Encoding for 'city':
   city_Delhi  city_Kolkata  city_Mumbai
0        True         False        False
1       False         False         True
2        True         False        False
3       False          True        False

Final data with all encodings:
   age  gender smoker  disease  smoker_encoded  gender_encoded     city  \
0   25    male    yes        1               1               1    Delhi   
1   30  female     no        0               0               0   Mumbai   
2   35    male    yes        1               1               1    Delhi   
3   40  female     no        0               0               0  Kolkata   

   city_Delhi  city_Kolkata  city_Mumbai  
0        True         False        False  
1       False         False         True  
2        True         False        False  
3       False          True        False  


# **Train a Simple Model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Create model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_scaled, y_train)
print("✅ Model trained!")

# Check accuracy on training data
train_score = model.score(X_train_scaled, y_train)
print(f"Training accuracy: {train_score:.1%}")

# Check accuracy on test data
test_score = model.score(X_test_scaled, y_test)
print(f"Test accuracy: {test_score:.1%}")

✅ Model trained!
Training accuracy: 78.5%
Test accuracy: 75.3%


# **Cross-Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# Do 5-fold cross-validation
scores = cross_val_score(
    model,           # Our model
    X_train_scaled,  # Training data
    y_train,         # Training labels
    cv=5            # 5 mini-tests
)

print("5 mini-test scores:")
for i in range(5):
    print(f"  Test {i+1}: {scores[i]:.1%}")

print(f"\nAverage: {scores.mean():.1%}")
print(f"This means our model is {scores.mean():.1%} accurate!")

5 mini-test scores:
  Test 1: 78.9%
  Test 2: 80.5%
  Test 3: 78.9%
  Test 4: 74.8%
  Test 5: 77.9%

Average: 78.2%
This means our model is 78.2% accurate!


# **Save Everything**

In [None]:
import joblib

# Save the model
joblib.dump(model, 'my_model.pkl')
print("✅ Model saved as 'my_model.pkl'")

# Save the scaler
joblib.dump(scaler, 'my_scaler.pkl')
print("✅ Scaler saved as 'my_scaler.pkl'")

# Save scaled data as CSV
pd.DataFrame(X_train_scaled).to_csv('train_data_scaled.csv', index=False)
print("✅ Scaled data saved as 'train_data_scaled.csv'")

✅ Model saved as 'my_model.pkl'
✅ Scaler saved as 'my_scaler.pkl'
✅ Scaled data saved as 'train_data_scaled.csv'


# **Use Saved Model for New Patient**

In [None]:
# Load saved model and scaler
model = joblib.load('my_model.pkl')
scaler = joblib.load('my_scaler.pkl')

# New patient data (8 features)
print("\nNew patient information:")
new_patient = [[5, 116, 74, 0, 0, 25.6, 0.201, 30]]
print("Pregnancies: 5")
print("Glucose: 116")
print("Blood Pressure: 74")
print("Age: 30")
print("(and other features...)")

# Scale the new patient data
new_patient_scaled = scaler.transform(new_patient)

# Make prediction
prediction = model.predict(new_patient_scaled)

if prediction[0] == 1:
    print("\n⚠️ Result: Risk of diabetes")
else:
    print("\n✅ Result: No diabetes risk")


New patient information:
Pregnancies: 5
Glucose: 116
Blood Pressure: 74
Age: 30
(and other features...)

✅ Result: No diabetes risk


# **Complete Code - All Steps Together**

In [None]:
# Complete simple pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# 1. Load data
df = pd.read_csv('/content/drive/MyDrive/datasets/diabetes.csv')
print(f"Loaded {len(df)} patients")

# 2. Split X and y
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training: {len(X_train)}, Testing: {len(X_test)}")

# 4. Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data scaled")

# 5. Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
print("Model trained")

# 6. Check accuracy
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.1%}")

# 7. Save
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Everything saved!")

Loaded 768 patients
Training: 614, Testing: 154
Data scaled
Model trained
Accuracy: 75.3%
Everything saved!


In [None]:
#Quick Reference - Most Important Code
#Split Data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Scale Data:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
#Train Model:
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
#Save Model:
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
'''
Remember These Points:

Always split data first - Never use test data for training
Scale after splitting - Fit scaler only on training data
Save your work - So you can use it later
Test on new data - To see if model really works
'''

'\nRemember These Points:\n\nAlways split data first - Never use test data for training\nScale after splitting - Fit scaler only on training data\nSave your work - So you can use it later\nTest on new data - To see if model really works\n'