In [None]:
import pandas as pd

# Example data: a customer table and an order table that share `customer_id`.
# Customer 4 has no orders, and order 104 refers to a customer (5) that is
# absent from `customers`, so both kinds of unmatched rows appear below.
customers = pd.DataFrame(dict(
    customer_id=[1, 2, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'David'],
    region=['East', 'West', 'East', 'South'],
))

orders = pd.DataFrame(dict(
    order_id=[101, 102, 103, 104, 105],
    customer_id=[1, 2, 2, 5, 3],
    amount=[50.0, 120.0, 80.0, 30.0, 90.0],
))

# Outer merge keeps the rows of both frames; indicator=True appends a
# `_merge` column recording whether each row was found in the left frame
# only, the right frame only, or both.
merged_df = customers.merge(orders, how='outer', on='customer_id', indicator=True)

print(merged_df)

: 

In [None]:
from os.path import join
import pandas as pd

# Sample tables, written one tuple per row.
customers = pd.DataFrame(
    [
        (1, "Alice", "East"),
        (2, "Bob", "West"),
        (3, "Charlie", "East"),
        (4, "David", "South"),
    ],
    columns=["customer_id", "name", "region"],
)

orders = pd.DataFrame(
    [
        (101, 1, 50.0),
        (102, 2, 120.0),
        (103, 2, 80.0),
        (104, 5, 30.0),
        (105, 3, 90.0),
    ],
    columns=["order_id", "customer_id", "amount"],
)

# DataFrame.join aligns on the index, so both frames are keyed by
# customer_id first. how="right" keeps every order, including the one whose
# customer (5) is missing from `customers`; reset_index then turns
# customer_id back into an ordinary column.
join_df = (
    customers
    .set_index("customer_id")
    .join(orders.set_index("customer_id"), how="right")
    .reset_index()
)

print(join_df)


In [None]:
# Long-format sales records: one row per (Month, Product) pair.
data = dict(
    Month=[1, 1, 2, 2, 3, 3],
    Product=['A', 'B', 'A', 'B', 'A', 'B'],
    Sales=[100, 50, 110, 60, 150, 70],
)
df_sales = pd.DataFrame(data)

# Reshape to wide format: one row per month, one column per product.
# Sales that share a (Month, Product) pair are summed.
pivoted_df = pd.pivot_table(
    df_sales,
    values='Sales',
    index='Month',
    columns='Product',
    aggfunc='sum',
)

print(pivoted_df)

In [None]:
# Unpivot the wide table back into long format: Month stays as the
# identifier, and the product columns A and B collapse into
# Product/Sales pairs.
melted_df = pivoted_df.reset_index().melt(
    id_vars=['Month'],
    value_vars=['A', 'B'],
    var_name='Product',
    value_name='Sales',
)

print(melted_df)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

# Toy matrix: three samples (rows) by two features (columns).
data = np.array([(10, 1), (20, 5), (30, 10)])

# 1. Normalization (MinMaxScaler): fit on the data, then rescale each column.
scaler_norm = MinMaxScaler()
data_normalized = scaler_norm.fit(data).transform(data)
print('Normalized (Min-Max):')
print(np.round(data_normalized, 2))

# 2. Standardization (StandardScaler): fit on the data, then convert each
# column to z-scores.
scaler_std = StandardScaler()
data_standardized = scaler_std.fit(data).transform(data)
print('\nStandardized (Z-Score):')
print(np.round(data_standardized, 2))

In [None]:
# 1. Optimizing Integer and Float Types

def downcast_numeric(df):
    """Shrink the 64-bit numeric columns of ``df`` to smaller dtypes.

    Columns selected as ``int64`` are passed through ``pd.to_numeric`` with
    ``downcast='integer'`` and columns selected as ``float64`` with
    ``downcast='float'``. All other columns are left untouched.

    Note: the columns are reassigned on ``df`` itself and that same object
    is returned, so pass ``df.copy()`` to keep the original frame intact.
    """
    # (substring looked for in the dtype name, matching `downcast` argument)
    downcast_by_kind = (('int', 'integer'), ('float', 'float'))
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        dtype_name = str(df[col].dtype)
        # The first matching kind wins, exactly like an if/elif chain.
        for kind, target in downcast_by_kind:
            if kind in dtype_name:
                df[col] = pd.to_numeric(df[col], downcast=target)
                break
    return df

# Create a sample DataFrame for demonstration
import pandas as pd
data = {
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'B': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1],
    'C': ['apple', 'banana', 'apple', 'orange', 'banana', 'apple', 'grape', 'orange', 'banana', 'apple'],
    'D': ['long string data', 'more long string data', 'long string data', 'unique string', 'more long string data', 'long string data', 'some other text', 'unique string', 'more long string data', 'long string data']
}
df = pd.DataFrame(data)

# deep=True also counts the memory held by the Python string objects,
# not just the pointers stored in the object columns.
original_bytes = df.memory_usage(deep=True).sum()

print("Original Dtypes:")
print(df.dtypes)
# Reported in KB: a ten-row frame is well under 0.01 MB, so an MB figure
# rounded to two decimals would read 0.00 both before and after optimizing.
print(f"Original Memory Usage: {original_bytes / 1024:.2f} KB")

# 2. Optimizing String Types (Critical for low-cardinality data)

# Column 'C' has only 4 unique values (apple, banana, orange, grape), i.e.
# low cardinality, so it is stored as a categorical.
df['C'] = df['C'].astype('category')

# downcast_numeric reassigns columns on the frame it receives, so it is
# given a copy to leave `df` itself untouched by the numeric downcast.
df_optimized = downcast_numeric(df.copy())

# Re-check memory usage
optimized_bytes = df_optimized.memory_usage(deep=True).sum()
# Kept in MB under its original name for any code that still reads it.
optimized_mem = optimized_bytes / (1024**2)

print('Optimized Dtypes:')
print(df_optimized.dtypes)
print(f"Optimized Memory Usage: {optimized_bytes / 1024:.2f} KB")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# 1. Sample Data (Simulating raw input); Age contains one missing value.
data = dict(
    Age=[30, 45, np.nan, 22, 60],
    Income=[50000, 120000, 80000, 30000, 150000],
    City=['NYC', 'London', 'Paris', 'NYC', 'London'],
    Target=[0, 1, 0, 1, 1],
)
df = pd.DataFrame(data)

# Separate the label column from the feature columns.
y = df['Target']
X = df.drop(columns='Target')

# --- Step 1: Define Column Groups ---

numerical_features = ['Age', 'Income']
categorical_features = ['City']

# --- Step 2: Define Sub-Pipelines ---

# Numerical columns: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps categories unseen during fit from
# raising at transform time; sparse_output=False returns a dense array.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# --- Step 3: Combine Pipelines using ColumnTransformer ---

# Each entry is (name, transformer, columns). remainder='passthrough'
# forwards any column not listed in either group unchanged instead of
# dropping it.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

# --- Step 4: Final Model Pipeline ---

# Chain the preprocessing and the classifier into a single estimator.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear')),
])

# --- Step 5: Training ---

# One fit call fits the preprocessing steps on X and then trains the
# classifier on the transformed features.
full_pipeline.fit(X, y)

print("Pipeline training successful.")

# --- Step 6: Prediction on New Data ---

# Unseen rows: one has a missing Age, and one has a city ('Berlin') that
# never appears in the training data.
new_data = pd.DataFrame(dict(
    Age=[40, np.nan],
    Income=[60000, 95000],
    City=['Paris', 'Berlin'],
))

# predict reuses the imputation, scaling and encoding fitted during
# training; nothing is re-fitted on new_data.
predictions = full_pipeline.predict(new_data)

print(f"\nPredictions on new data: {predictions}")