# Code-Based Questions: Assignment 3
## 2.1.1 Advanced Data Wrangling with Pandas

### Q41. Write Pandas code to merge two datasets using inner and outer joins.

In [19]:
import pandas as pd

# Two small frames that share a 'key' column; only keys 2 and 3 occur in both.
df1 = pd.DataFrame({'key': range(1, 4), 'val1': list('ABC')})
df2 = pd.DataFrame({'key': range(2, 5), 'val2': list('XYZ')})

# Inner join keeps only the keys found in both frames; outer join keeps every
# key and leaves NaN where one side has no matching row.
inner_merge = df1.merge(df2, how='inner', on='key')
outer_merge = df1.merge(df2, how='outer', on='key')

print("Inner Join Results:", inner_merge, sep="\n")
print("\nOuter Join Results:", outer_merge, sep="\n")

Inner Join Results:
   key val1 val2
0    2    B    X
1    3    C    Y

Outer Join Results:
   key val1 val2
0    1    A  NaN
1    2    B    X
2    3    C    Y
3    4  NaN    Z


### Q42. Given a dataset, create a pivot table showing total sales per category.

In [20]:
# One row per sale; categories repeat so the aggregation has something to add up.
sales_df = pd.DataFrame(
    [
        ('Electronics', 1000),
        ('Clothing', 500),
        ('Electronics', 1200),
        ('Home', 800),
        ('Clothing', 600),
    ],
    columns=['Category', 'Sales'],
)

# Rows of the pivot are the categories; each cell is the summed 'Sales' of that category.
pivot = pd.pivot_table(sales_df, index='Category', values='Sales', aggfunc='sum')
print("Pivot Table (Total Sales per Category):", pivot, sep="\n")

Pivot Table (Total Sales per Category):
             Sales
Category          
Clothing      1100
Electronics   2200
Home           800


### Q43. Reshape a DataFrame from wide format to long format using melt().

In [21]:
# Wide layout: one row per ID, one column per subject.
wide_df = pd.DataFrame({'ID': [1, 2], 'Math': [90, 80], 'Science': [85, 88]})

# Long layout: every (ID, subject) pair becomes its own row, with the former
# column name stored in 'Subject' and the cell value stored in 'Score'.
long_df = pd.melt(wide_df, id_vars=['ID'], var_name='Subject', value_name='Score')
print("Melted DataFrame (Long Format):", long_df, sep="\n")

Melted DataFrame (Long Format):
   ID  Subject  Score
0   1     Math     90
1   2     Math     80
2   1  Science     85
3   2  Science     88


### Q44. Write Python code to detect and handle missing values.

In [22]:
import numpy as np
# Sample frame with one gap in 'A' and two gaps in 'B'.
df_missing = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8]})

# Detect: count the missing entries in each column.
null_counts = df_missing.isna().sum()
print("Missing counts:\n", null_counts)

# Handle: impute 'A' with its own column mean (written back onto df_missing),
# then replace every gap that is still left (here only in 'B') with 0.
mean_of_a = df_missing['A'].mean()
df_missing['A'] = df_missing['A'].fillna(mean_of_a)
df_cleaned = df_missing.fillna(value=0)
print("\nHandled DataFrame:", df_cleaned, sep="\n")

Missing counts:
 A    1
B    2
dtype: int64

Handled DataFrame:
          A    B
0  1.000000  5.0
1  2.000000  0.0
2  2.333333  0.0
3  4.000000  8.0


### Q45. Encode categorical variables using one-hot encoding.

In [23]:
# A single categorical column holding three distinct cities.
df_cat = pd.DataFrame({'City': ['Berlin', 'Paris', 'Berlin', 'Madrid']})

# One indicator column per distinct city, each named 'City_<value>'.
one_hot = pd.get_dummies(df_cat['City'], prefix='City')
print("One-Hot Encoded cities:", one_hot, sep="\n")

One-Hot Encoded cities:
   City_Berlin  City_Madrid  City_Paris
0         True        False       False
1        False        False        True
2         True        False       False
3        False         True       False


### Q46. Convert a column into datetime format and perform time-based indexing.

In [24]:
# Dates arrive as plain strings next to a numeric reading.
raw_dates = ['2023-01-01', '2023-01-02', '2023-01-03']
df_time = pd.DataFrame({'date_str': raw_dates, 'value': [10, 20, 30]})

# Parse the strings into timestamps and promote them to the index, so that
# rows can be looked up by date.
df_time = (
    df_time
    .assign(datetime=pd.to_datetime(df_time['date_str']))
    .set_index('datetime')
)

# A date string selects the row whose timestamp matches it.
print("Time-based Indexed Data:", df_time.loc['2023-01-02'], sep="\n")

Time-based Indexed Data:
date_str    2023-01-02
value               20
Name: 2023-01-02 00:00:00, dtype: object


### Q47. Apply forward fill and backward fill on time-series data.

In [25]:
# Series with a two-element gap between the values 1 and 4.
ts_data = pd.Series([1, np.nan, np.nan, 4, 5])

# ffill carries the last seen value forward into the gap;
# bfill carries the next seen value backward into it.
ffill, bfill = ts_data.ffill(), ts_data.bfill()

for heading, filled in (("Forward Fill:\n", ffill), ("\nBackward Fill:\n", bfill)):
    print(heading, filled)

Forward Fill:
 0    1.0
1    1.0
2    1.0
3    4.0
4    5.0
dtype: float64

Backward Fill:
 0    1.0
1    4.0
2    4.0
3    4.0
4    5.0
dtype: float64


### Q48. Normalize and standardize numerical features using Python.

In [26]:
nums = pd.Series([10, 20, 30, 40, 50])

# Normalization (Min-Max Scaling): the smallest value maps to 0, the largest to 1.
lowest, highest = nums.min(), nums.max()
normalized = (nums - lowest) / (highest - lowest)

# Standardization (Z-score): centre on the mean, then divide by the standard deviation.
# Series.std() is the sample standard deviation (ddof=1), which is why the
# extreme scores come out as +/-1.264911 rather than +/-1.414214.
centered = nums - nums.mean()
standardized = centered / nums.std()

for heading, scaled in (("Normalized:\n", normalized), ("\nStandardized:\n", standardized)):
    print(heading, scaled)

Normalized:
 0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

Standardized:
 0   -1.264911
1   -0.632456
2    0.000000
3    0.632456
4    1.264911
dtype: float64


### Q49. Write code to reduce memory usage of a DataFrame.

In [27]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    ``float64`` columns are passed through ``pd.to_numeric(..., downcast='float')``.
    Integer columns of any width are passed through ``pd.to_numeric`` with
    ``downcast='integer'`` (signed) or ``downcast='unsigned'`` (unsigned), so a
    column that is already ``int32`` can still shrink further. Columns of any
    other dtype, including pandas extension dtypes, are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the
        caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for col in df.columns:
        dtype = df[col].dtype

        # Extension dtypes are skipped; only plain NumPy numeric columns are downcast.
        if pd.api.types.is_extension_array_dtype(dtype):
            continue

        if dtype == 'float64':
            # NOTE(review): a narrower float holds fewer significant digits;
            # confirm that is acceptable for the data being shrunk.
            target = 'float'
        elif pd.api.types.is_unsigned_integer_dtype(dtype):
            target = 'unsigned'
        elif pd.api.types.is_signed_integer_dtype(dtype):
            target = 'integer'
        else:
            continue

        df[col] = pd.to_numeric(df[col], downcast=target)
    return df

# 1000 integer values in 'a' and 1000 float values in 'b'.
big_df = pd.DataFrame({'a': range(1000), 'b': [1.1]*1000})

# Measure first: reduce_mem_usage reassigns columns on the frame it receives.
bytes_before = big_df.memory_usage(deep=True).sum()
print("Before Memory:", bytes_before)

reduced_df = reduce_mem_usage(big_df)
bytes_after = reduced_df.memory_usage(deep=True).sum()
print("After Memory:", bytes_after)

Before Memory: 16132
After Memory: 6132


### Q50. Process a large CSV file using chunking in Pandas.

In [28]:
# Demonstrating chunking pattern
def process_chunks(filename, chunksize=500):
    """Count the data rows of a CSV file by reading it in chunks.

    The file is read with ``pd.read_csv(filename, chunksize=chunksize)``, so
    only one chunk of rows is held at a time.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read.
    chunksize : int, default 500
        Number of rows per chunk.

    Returns
    -------
    int or None
        Total number of rows across all chunks. If ``filename`` does not
        exist, the chunking pattern is printed and ``None`` is returned, so
        the demonstration call below still runs without a data file.
    """
    try:
        reader = pd.read_csv(filename, chunksize=chunksize)
    except FileNotFoundError:
        print("Chunking implementation: pd.read_csv(file, chunksize=N)")
        return None

    total_rows = 0
    # The reader is used as a context manager so the file handle is closed
    # once the last chunk has been consumed.
    with reader:
        for chunk in reader:
            total_rows += len(chunk)
    return total_rows

process_chunks('dummy.csv')

Chunking implementation: pd.read_csv(file, chunksize=N)


### Q51. Write a reusable data-cleaning function in Python.

In [29]:
def clean_data(df):
    """Return a cleaned copy of ``df``.

    Steps, in order:

    1. Strip leading/trailing whitespace from string values in text columns.
       Non-string values in those columns are kept unchanged.
    2. Drop duplicate rows. This runs after stripping so that rows which
       differ only by surrounding whitespace are recognised as duplicates.
    3. Fill missing values in numeric columns with that column's mean,
       computed on the de-duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original row labels of the kept rows are preserved.
    """
    def _strip_if_str(value):
        # Only real strings are stripped; numbers, None and NaN pass through.
        return value.strip() if isinstance(value, str) else value

    cleaned = df.copy()
    for col, dtype in cleaned.dtypes.items():
        if pd.api.types.is_string_dtype(dtype):
            cleaned[col] = cleaned[col].map(_strip_if_str)

    cleaned = cleaned.drop_duplicates()
    return cleaned.fillna(cleaned.mean(numeric_only=True))

print("Standard reusable cleaner defined.")

Standard reusable cleaner defined.


### Q52. Implement a simple data-cleaning pipeline using Pandas.

In [30]:
def _lower_if_str(label):
    """Lower-case a string column label; return any other label unchanged."""
    return label.lower() if isinstance(label, str) else label


def data_pipeline(df):
    """Run a three-step cleaning pipeline and return the result as a new frame.

    Steps, in order: drop duplicate rows, replace every missing value with 0,
    and lower-case the column labels. Labels that are not strings (for example
    integer labels) are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    return (
        df
        .drop_duplicates()
        .fillna(0)
        .rename(columns=_lower_if_str)
    )

print("DataFrame pipeline defined.")

DataFrame pipeline defined.


### Q53. Write code to ingest data from CSV, database, and API sources.

In [31]:
import sqlite3
import requests

# 1. CSV -- read a delimited file from disk (left commented out: no file is provided)
# csv_data = pd.read_csv('file.csv')

# 2. Database -- run a SQL query over an open connection.
# ':memory:' opens an in-memory SQLite database, so no database file is needed.
conn = sqlite3.connect(':memory:')
db_data = pd.read_sql(sql='SELECT * FROM sqlite_master', con=conn)

# 3. API -- fetch JSON over HTTP and load it into a frame (left commented out: placeholder URL)
# api_resp = requests.get('https://api.example.com/data')
# api_data = pd.DataFrame(api_resp.json())

print("Ingestion methods defined.")

Ingestion methods defined.


### Q54. Merge multiple datasets into a unified table.

In [32]:
# Three single-row frames that all share the 'ID' key and each add one column.
dfs = [
    pd.DataFrame({'ID': [1], column: [value]})
    for column, value in (('A', 10), ('B', 20), ('C', 30))
]

# Fold the list left to right, joining each frame onto the running result by 'ID'.
unified, *remaining = dfs
for frame in remaining:
    unified = unified.merge(frame, on='ID')

print("Unified Table:", unified, sep="\n")

Unified Table:
   ID   A   B   C
0   1  10  20  30


### Q55. Implement RFM (Recency, Frequency, Monetary) analysis using Pandas.

In [33]:
from datetime import datetime
# One row per order.
order_dates = pd.to_datetime(['2023-10-01', '2023-12-01', '2023-11-15'])
orders = pd.DataFrame({
    'CustomerID': [1, 1, 2],
    'OrderDate': order_dates,
    'Amount': [100, 150, 200],
})

# Reference date from which recency is measured.
snapshot_date = datetime(2023, 12, 31)


def _days_since_last_order(dates):
    """Whole days between the customer's most recent order and snapshot_date."""
    return (snapshot_date - dates.max()).days


# Per customer: days since the latest order, number of orders, total amount.
rfm_aggregations = {
    'OrderDate': _days_since_last_order,
    'CustomerID': 'count',
    'Amount': 'sum',
}
rfm_column_names = {'OrderDate': 'Recency', 'CustomerID': 'Frequency', 'Amount': 'Monetary'}

rfm = orders.groupby('CustomerID').agg(rfm_aggregations)
rfm = rfm.rename(columns=rfm_column_names)

print("RFM Analysis Table:", rfm, sep="\n")

RFM Analysis Table:
            Recency  Frequency  Monetary
CustomerID                              
1                30          2       250
2                46          1       200


### Q56. Compute Customer Lifetime Value (CLV).

In [34]:
# Simple CLV: Monthly Revenue * Average Lifespan
# Simplified example for 1 year: each customer's 'Monetary' value is multiplied by 12.
# NOTE(review): this treats 'Monetary' as a monthly figure -- confirm that matches
# how 'Monetary' was computed before relying on these numbers.
rfm['CLV'] = rfm['Monetary'].mul(12)
print("CLV Computation:", rfm[['CLV']], sep="\n")

CLV Computation:
             CLV
CustomerID      
1           3000
2           2400


### Q57. Store processed data into a CSV file or database table.

In [35]:
# Two ways to persist the processed frame. Both calls are left commented out,
# so running this cell writes nothing and only prints the reminder below.
# Option 1 -- write the frame to a CSV file:
# rfm.to_csv('processed_rfm.csv')
# Option 2 -- write the frame to a SQL table over the connection `conn`,
# replacing the table if it already exists:
# rfm.to_sql('rfm_analysis', conn, if_exists='replace')
print("Storage logic: df.to_csv() or df.to_sql()")

Storage logic: df.to_csv() or df.to_sql()


### Q58. Design a simple end-to-end data pipeline using Python.

In [36]:
def end_to_end_pipeline(source_path, target_path, transform=None):
    """Run a minimal ingest -> process -> store pipeline over CSV files.

    Parameters
    ----------
    source_path : str or path-like
        CSV file to read.
    target_path : str or path-like
        CSV file to write the processed frame to.
    transform : callable, optional
        Function taking a DataFrame and returning the processed DataFrame.
        When omitted, the notebook's ``clean_data`` function is used.

    Returns
    -------
    pandas.DataFrame or None
        The processed frame that was written to ``target_path``. If
        ``source_path`` does not exist, a message is printed, nothing is
        written and ``None`` is returned, so the demonstration call below
        still runs without a data file.
    """
    # 1. Ingest
    try:
        df = pd.read_csv(source_path)
    except FileNotFoundError:
        print("Full ETL Pipeline Architecture defined.")
        return None

    # 2. Process
    # clean_data is looked up only when no transform is supplied.
    step = clean_data if transform is None else transform
    df_clean = step(df)

    # 3. Store
    # The row labels are not written: they carry no data of their own.
    df_clean.to_csv(target_path, index=False)
    return df_clean

end_to_end_pipeline('raw.csv', 'clean.csv')

Full ETL Pipeline Architecture defined.
