***DATA PIPELINE DEVELOPMENT***

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [7]:
# Step 1: Load the dataset
data = pd.read_csv("D:/intern.csv")  # Ensure data.csv is in the same directory
print("Original Data Preview:\n", data.head())

Original Data Preview:
    ID     Name  Age  Gender           City  Income Education Marital_Status  \
0   1    Alice   25  Female       New York   50000  Bachelor         Single   
1   2      Bob   30    Male    Los Angeles   60000    Master        Married   
2   3  Charlie   35    Male        Chicago   70000       PhD        Married   
3   4    Diana   40  Female  San Francisco   55000  Bachelor       Divorced   
4   5    Ethan   28    Male        Seattle   62000    Master         Single   

          Occupation  Loan_Amount  ...  Work_Hours  Screen_Time  \
0           Engineer        10000  ...           8            5   
1             Doctor        15000  ...           9            4   
2          Scientist        20000  ...           7            6   
3             Artist        12000  ...           7            5   
4  Software Engineer        18000  ...           9            3   

   Depression_Score Anxiety_Level Job_Satisfaction  Happiness_Score  \
0                 2        

In [9]:
# Step 2: Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=["object"]).columns.drop(["Name", "Diabetes_Risk"], errors="ignore")
numerical_cols = data.select_dtypes(exclude=["object"]).columns

print("\nCategorical Columns:", list(categorical_cols))
print("Numerical Columns:", list(numerical_cols))


Categorical Columns: ['Gender', 'City', 'Education', 'Marital_Status', 'Occupation', 'Device_Type', 'Subscription_Type', 'Blood_Pressure']
Numerical Columns: ['ID', 'Age', 'Income', 'Loan_Amount', 'Credit_Score', 'Spending_Score', 'Internet_Usage', 'Health_Score', 'Exercise_Hours', 'Sleep_Hours', 'Stress_Level', 'Social_Media_Usage', 'Mobile_Usage', 'Work_Hours', 'Screen_Time', 'Depression_Score', 'Anxiety_Level', 'Job_Satisfaction', 'Happiness_Score', 'Physical_Activity', 'Heart_Rate']


In [10]:
# Step 3: Define the target column (optional)
target_column = "Diabetes_Risk"  # Change if needed
if target_column in data.columns:
    X = data.drop([target_column, "Name"], axis=1, errors="ignore")  # Drop non-essential columns
    y = data[target_column]
else:
    X = data.drop(["Name"], axis=1, errors="ignore")  # No target column
    y = None

print("\nFeatures Before Processing:\n", X.head())


Features Before Processing:
    ID  Age  Gender           City  Income Education Marital_Status  \
0   1   25  Female       New York   50000  Bachelor         Single   
1   2   30    Male    Los Angeles   60000    Master        Married   
2   3   35    Male        Chicago   70000       PhD        Married   
3   4   40  Female  San Francisco   55000  Bachelor       Divorced   
4   5   28    Male        Seattle   62000    Master         Single   

          Occupation  Loan_Amount  Credit_Score  ...  Mobile_Usage  \
0           Engineer        10000           750  ...             6   
1             Doctor        15000           800  ...             7   
2          Scientist        20000           820  ...             5   
3             Artist        12000           770  ...             8   
4  Software Engineer        18000           780  ...             6   

   Work_Hours Screen_Time Depression_Score  Anxiety_Level  Job_Satisfaction  \
0           8           5                2       

In [11]:
# Step 4: Define preprocessing steps
num_imputer = SimpleImputer(strategy="mean")  # Fill missing numerical values with mean
cat_imputer = SimpleImputer(strategy="most_frequent")  # Fill missing categorical values with most frequent
encoder = OneHotEncoder(handle_unknown="ignore")  # Convert categorical values to numeric
scaler = StandardScaler()  # Standardize numerical values

In [12]:
# Step 5: Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", num_imputer), ("scaler", scaler)]), numerical_cols),
        ("cat", Pipeline([("imputer", cat_imputer), ("encoder", encoder)]), categorical_cols),
    ]
)

# Step 6: Apply the pipeline
pipeline = Pipeline([
    ("preprocessing", preprocessor)
])


In [13]:
# Step 7: Split data if target column exists
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("\nTraining Data Shape:", X_train.shape)
    print("Testing Data Shape:", X_test.shape)

    X_train_transformed = pipeline.fit_transform(X_train)
    X_test_transformed = pipeline.transform(X_test)

    # Convert to DataFrame
    processed_train_data = pd.DataFrame(X_train_transformed)
    processed_test_data = pd.DataFrame(X_test_transformed)

    # Save the processed data
    processed_train_data.to_csv("processed_train.csv", index=False)
    processed_test_data.to_csv("processed_test.csv", index=False)

    print("\nProcessed Training Data Sample:\n", processed_train_data.head())
    print("\nProcessed Testing Data Sample:\n", processed_test_data.head())

    print("\n✅ Pipeline executed successfully! Processed training and test data saved.")
else:
    # If no target column, apply transformations to the whole dataset
    processed_data = pipeline.fit_transform(X)
    processed_df = pd.DataFrame(processed_data)

    # Save the transformed data
    processed_df.to_csv("processed_data.csv", index=False)

    print("\nProcessed Data Sample:\n", processed_df.head())
    print("\n✅ Pipeline executed successfully! Processed data saved.")


Training Data Shape: (8, 29)
Testing Data Shape: (2, 29)

Processed Training Data Sample:
          0         1         2         3         4         5         6   \
0  0.185695  0.038490 -0.268024 -0.902794 -1.194476 -1.118862 -1.375048   
1 -1.671258 -1.193191 -1.277056 -1.109147 -0.670870  0.946729  0.825029   
2  0.928477 -0.885270 -0.772540 -0.283735 -0.278166 -0.774597 -0.641689   
3 -0.928477  0.346410  1.245524  0.954382  1.161751  0.258199  0.091670   
4  1.671258 -0.577350 -0.394153 -0.490088 -0.932673 -1.463127 -1.375048   

         7         8         9   ...   49   50   51   52   53   54   55   56  \
0 -1.097260 -1.441153 -1.341641  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1  0.399004  0.480384  0.447214  ...  1.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0   
2 -0.299253 -0.160128 -0.447214  ...  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0   
3  0.698257 -0.160128  1.341641  ...  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0   
4 -1.596015 -0.800641 -1.341641  ...  0.0  0.0  0.0  0.0 