In [36]:
import pandas as pd

# Example DataFrames
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'region': ['East', 'West', 'East', 'South']
})

orders = pd.DataFrame({
    'order_id': [101, 102, 103, 104, 105],
    'customer_id': [1, 2, 2, 5, 3],
    'amount': [50.0, 120.0, 80.0, 30.0, 90.0]
})


merged_df = pd.merge(customers, orders,
                     on='customer_id',
                     how='outer',
                     indicator=True) 

print(merged_df)

   customer_id     name region  order_id  amount      _merge
0            1    Alice   East     101.0    50.0        both
1            2      Bob   West     102.0   120.0        both
2            2      Bob   West     103.0    80.0        both
3            3  Charlie   East     105.0    90.0        both
4            4    David  South       NaN     NaN   left_only
5            5      NaN    NaN     104.0    30.0  right_only


In [37]:
from os.path import join
import pandas as pd

customers = pd.DataFrame({
    "customer_id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "David"],
    "region": ["East", "West", "East", "South"]
})

orders = pd.DataFrame({
    "order_id": [101, 102, 103, 104, 105],
    "customer_id": [1, 2, 2, 5, 3],
    "amount": [50.0, 120.0, 80.0, 30.0, 90.0]
})

join_df = customers.set_index("customer_id").join(
    orders.set_index("customer_id"),
    how="right"
).reset_index()

print(join_df)


   customer_id     name region  order_id  amount
0            1    Alice   East       101    50.0
1            2      Bob   West       102   120.0
2            2      Bob   West       103    80.0
3            5      NaN    NaN       104    30.0
4            3  Charlie   East       105    90.0


In [38]:
# Data prepared for pivoting: Sales by Product and Month
data = {
    'Month': [1, 1, 2, 2, 3, 3],
    'Product': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 50, 110, 60, 150, 70]
}
df_sales = pd.DataFrame(data)

# Pivot: Turn products into columns, aggregated by month
pivoted_df = df_sales.pivot_table(
    index='Month',
    columns='Product',
    values='Sales',
    aggfunc='sum'
)

print(pivoted_df)

Product    A   B
Month           
1        100  50
2        110  60
3        150  70


In [39]:
# Melting the pivoted_df back into a long format
melted_df = pd.melt(pivoted_df.reset_index(),
                    id_vars=['Month'],
                    value_vars=['A', 'B'],
                    var_name='Product',
                    value_name='Sales')

print(melted_df)

   Month Product  Sales
0      1       A    100
1      2       A    110
2      3       A    150
3      1       B     50
4      2       B     60
5      3       B     70


In [40]:
import pandas as pd
import numpy as np

data = {'A': [1, 2, np.nan, 4], 
        'B': [5, np.nan, 7, 8], 
        'C': [9, 10, 11, np.nan]}
df = pd.DataFrame(data)

# 1. Total count of missing values per column
print('Missing Counts:\n', df.isnull().sum())

# 2. Percentage of missing values
print('\nMissing Percentage:\n', (df.isnull().sum() / len(df)) * 100)

Missing Counts:
 A    1
B    1
C    1
dtype: int64

Missing Percentage:
 A    25.0
B    25.0
C    25.0
dtype: float64


In [41]:
from sklearn.impute import SimpleImputer
import pandas as pd

# 1. Impute Column A (Numeric) with the Median
median_A = df['A'].median()
df['A_imputed_median'] = df['A'].fillna(median_A)

# 2. Impute Column C (Time Series) using Forward Fill (FFill)
df['C_imputed_ffill'] = df['C'].ffill()

print(df)


     A    B     C  A_imputed_median  C_imputed_ffill
0  1.0  5.0   9.0               1.0              9.0
1  2.0  NaN  10.0               2.0             10.0
2  NaN  7.0  11.0               2.0             11.0
3  4.0  8.0   NaN               4.0             11.0


In [42]:
# Using Scikit-learn to handle multiple columns with different strategies
imputer_mean = SimpleImputer(strategy='mean')

# Fit and transform (A and B are imputed with their respective means)
df[['A', 'B']] = imputer_mean.fit_transform(df[['A', 'B']])

print('\nDataFrame after sklearn mean imputation:')
print(df)


DataFrame after sklearn mean imputation:
          A         B     C  A_imputed_median  C_imputed_ffill
0  1.000000  5.000000   9.0               1.0              9.0
1  2.000000  6.666667  10.0               2.0             10.0
2  2.333333  7.000000  11.0               2.0             11.0
3  4.000000  8.000000   NaN               4.0             11.0


In [43]:
import pandas as pd

data = {
    'City': ['NYC', 'London', 'Paris', 'NYC'],
    'Price': [100, 200, 300, 150]
}
df = pd.DataFrame(data)

# Perform OHE
df_ohe = pd.get_dummies(df, columns=['City'], prefix='is')

print('One-Hot Encoded Data:')
print(df_ohe)

One-Hot Encoded Data:
   Price  is_London  is_NYC  is_Paris
0    100      False    True     False
1    200       True   False     False
2    300      False   False      True
3    150      False    True     False


In [44]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

data = {
    'Size': ['Small', 'Medium', 'Large', 'Small'],
    'Value': [10, 20, 30, 15]
}
df_ord = pd.DataFrame(data)

# Define the explicit order
size_order = ['Small', 'Medium', 'Large']

encoder = OrdinalEncoder(categories=[size_order])

# Fit and transform the 'Size' column
df_ord['Size_Encoded'] = encoder.fit_transform(df_ord[['Size']])

print('\nOrdinal Encoded Data:')
print(df_ord)


Ordinal Encoded Data:
     Size  Value  Size_Encoded
0   Small     10           0.0
1  Medium     20           1.0
2   Large     30           2.0
3   Small     15           0.0


In [45]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

data = np.array([[10, 1], [20, 5], [30, 10]])

# 1. Normalization (MinMaxScaler)
scaler_norm = MinMaxScaler()
data_normalized = scaler_norm.fit_transform(data)
print('Normalized (Min-Max):')
print(data_normalized.round(2))

# 2. Standardization (StandardScaler)
scaler_std = StandardScaler()
data_standardized = scaler_std.fit_transform(data)
print('\nStandardized (Z-Score):')
print(data_standardized.round(2))

Normalized (Min-Max):
[[0.   0.  ]
 [0.5  0.44]
 [1.   1.  ]]

Standardized (Z-Score):
[[-1.22 -1.18]
 [ 0.   -0.09]
 [ 1.22  1.27]]


In [46]:
# 1. Optimizing Integer and Float Types

def downcast_numeric(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        # Check limits and downcast to the smallest fit
        if 'int' in str(df[col].dtype):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif 'float' in str(df[col].dtype):
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Create a sample DataFrame for demonstration
import pandas as pd
data = {
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'B': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1],
    'C': ['apple', 'banana', 'apple', 'orange', 'banana', 'apple', 'grape', 'orange', 'banana', 'apple'],
    'D': ['long string data', 'more long string data', 'long string data', 'unique string', 'more long string data', 'long string data', 'some other text', 'unique string', 'more long string data', 'long string data']
}
df = pd.DataFrame(data)

print("Original Dtypes:")
print(df.dtypes)
print(f"Original Memory Usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

# 2. Optimizing String Types (Critical for low-cardinality data)

# Column 'C' has only 5 unique values (low cardinality)
df['C'] = df['C'].astype('category')

df_optimized = downcast_numeric(df.copy())

# Re-check memory usage
optimized_mem = df_optimized.memory_usage(deep=True).sum() / (1024**2)

print('Optimized Dtypes:')
print(df_optimized.dtypes)
print(f"Optimized Memory Usage: {optimized_mem:.2f} MB")

Original Dtypes:
A      int64
B    float64
C        str
D        str
dtype: object
Original Memory Usage: 0.00 MB
Optimized Dtypes:
A        int8
B     float32
C    category
D         str
dtype: object
Optimized Memory Usage: 0.00 MB


In [47]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# 1. Sample Data (Simulating raw input)
data = {
    'Age': [30, 45, np.nan, 22, 60],
    'Income': [50000, 120000, 80000, 30000, 150000],
    'City': ['NYC', 'London', 'Paris', 'NYC', 'London'],
    'Target': [0, 1, 0, 1, 1]
}
df = pd.DataFrame(data)
X = df.drop('Target', axis=1)
y = df['Target']

# --- Step 1: Define Column Groups ---

numerical_features = ['Age', 'Income']
categorical_features = ['City']

# --- Step 2: Define Sub-Pipelines ---

# Pipeline for Numerical Data (Impute missing, then scale)
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline for Categorical Data (Handle missing, then OHE)
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))

])

# --- Step 3: Combine Pipelines using ColumnTransformer ---

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

# --- Step 4: Final Model Pipeline ---

# The final pipeline integrates preprocessing and the model
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# --- Step 5: Training ---

# The entire cleaning/scaling/training process is run in one line
full_pipeline.fit(X, y)

print("Pipeline training successful.")

# --- Step 6: Prediction on New Data ---

new_data = pd.DataFrame({
    'Age': [40, np.nan],
    'Income': [60000, 95000],
    'City': ['Paris', 'Berlin']
})

# The exact scaling and imputation rules learned from the training data are applied
predictions = full_pipeline.predict(new_data)

print(f"\nPredictions on new data: {predictions}")

Pipeline training successful.

Predictions on new data: [0 1]
