In [1]:
# ETL Pipeline - Internship Task 1
# Author: Manikandan
# Description: Extract, Transform, Load process using Pandas and Scikit-learn

# --- Import required libraries ---
import pandas as pd
from sklearn.preprocessing import StandardScaler

# --- Step 1: Extract ---
# Build the raw dataset. The in-memory sample below stands in for a real
# source; note that `age` and `salary` each contain one missing value (None),
# which pandas stores as NaN in a float column.
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, None],
    'gender': ['Female', 'Male', 'Male', 'Female'],
    'salary': [50000, 60000, None, 52000]
}

# To read from a file instead, swap the DataFrame construction below for:
# df = pd.read_csv('your_raw_data.csv')
df = pd.DataFrame(data=data)

# Show the untouched input so it can be compared with the transformed output.
print("Raw Data:", df, sep="\n")

# --- Step 2: Transform ---
# 1. Drop rows with missing values
#    Any row that has a missing value in any column is removed.
df = df.dropna()

# 2. Convert gender to numerical
#    Series.map() turns every label that is not a key of the mapping into NaN.
#    Because this step runs after dropna(), such NaNs would silently end up in
#    the output file, so labels without a code are rejected up front instead.
gender_codes = {'Male': 0, 'Female': 1}
unmapped_genders = (
    df.loc[~df['gender'].isin(list(gender_codes)), 'gender'].unique().tolist()
)
if unmapped_genders:
    raise ValueError(
        f"Unexpected gender value(s) {unmapped_genders}; "
        f"expected one of {list(gender_codes)}"
    )
df['gender'] = df['gender'].map(gender_codes)

# 3. Scale numeric columns
#    The scaler is fitted on the rows that remain after the dropna() above,
#    and the scaled values replace the original 'age' and 'salary' columns.
scaler = StandardScaler()
df[['age', 'salary']] = scaler.fit_transform(df[['age', 'salary']])

print("\nTransformed Data:")
print(df)

# --- Step 3: Load ---
# Persist the transformed frame as CSV in the current working directory.
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(path_or_buf='processed_data.csv', index=False)
print("\nProcessed data saved to 'processed_data.csv'")


Raw Data:
      name   age  gender   salary
0    Alice  25.0  Female  50000.0
1      Bob  30.0    Male  60000.0
2  Charlie  35.0    Male      NaN
3    David   NaN  Female  52000.0

Transformed Data:
    name  age  gender  salary
0  Alice -1.0       1    -1.0
1    Bob  1.0       0     1.0

Processed data saved to 'processed_data.csv'
