# Cleaning data

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Sample data: a small product table with gaps in every numeric column.
# Both np.nan and None become NaN once the columns are built as float64.
data = {'Product': ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Phone', 'Keyboard'],
        'Price': [1000, 500, np.nan, 300, 600, None],
        'Rating': [4.5, np.nan, 4.2, 4.0, 4.7, 3.8],
        'Stock': [np.nan, 20, 15, 10, None, 25]}

# Keep the raw frame untouched so that each imputation strategy starts from
# the same input and this cell gives the same result every time it is re-run.
df_raw = pd.DataFrame(data)
print("Original Data:\n", df_raw)

numeric_cols = ['Price', 'Rating', 'Stock']

# Strategy 1: fill numeric missing values with the column mean.
# fillna returns a new frame; df_raw still holds the NaNs afterwards.
df = df_raw.fillna(df_raw.mean(numeric_only=True))

# Strategy 2: KNN imputation, fitted on the RAW data.
# Previously this ran after the mean fill, when no NaN was left, so the
# imputer had nothing to impute and returned its input unchanged.
# NOTE: neighbour distances are computed on the unscaled columns, so the
# column with the largest numeric range (Price here) dominates the choice of
# neighbours; consider scaling first on real data.
knn_imputer = KNNImputer(n_neighbors=2)
df_knn = df_raw.copy()
df_knn[numeric_cols] = knn_imputer.fit_transform(df_raw[numeric_cols])

print("\nCleaned Data:\n", df)
print("\nCleaned Data (KNN):\n", df_knn)

Original Data:
     Product   Price  Rating  Stock
0    Laptop  1000.0     4.5    NaN
1     Phone   500.0     NaN   20.0
2    Tablet     NaN     4.2   15.0
3   Monitor   300.0     4.0   10.0
4     Phone   600.0     4.7    NaN
5  Keyboard     NaN     3.8   25.0

Cleaned Data:
     Product   Price  Rating  Stock
0    Laptop  1000.0    4.50   17.5
1     Phone   500.0    4.24   20.0
2    Tablet   600.0    4.20   15.0
3   Monitor   300.0    4.00   10.0
4     Phone   600.0    4.70   17.5
5  Keyboard   600.0    3.80   25.0


# Data Integration

In [2]:
import pandas as pd

# Three small source tables that share the Product_ID key.
# Each one is missing at least one product that another table has.

# Sales: product name and price
df_sales = pd.DataFrame(
    {
        'Product_ID': [1, 2, 3, 4],
        'Product': ['Laptop', 'Phone', 'Tablet', 'Monitor'],
        'Price': [1000, 500, 700, 300],
    }
)

# Reviews: average rating per product
df_reviews = pd.DataFrame(
    {
        'Product_ID': [1, 2, 3, 5],
        'Rating': [4.5, 4.2, 4.8, 4.3],
    }
)

# Stock: units currently available
df_stock = pd.DataFrame(
    {
        'Product_ID': [1, 2, 4, 5],
        'Stock_Available': [15, 30, 10, 5],
    }
)

# Full outer join across all three tables: a product that appears in any
# table gets a row, and attributes missing from a table come out as NaN.
df_merged = (
    df_sales
    .merge(df_reviews, on='Product_ID', how='outer')
    .merge(df_stock, on='Product_ID', how='outer')
)

# Show the integrated table
print("Integrated Data:\n", df_merged)

Integrated Data:
    Product_ID  Product   Price  Rating  Stock_Available
0           1   Laptop  1000.0     4.5             15.0
1           2    Phone   500.0     4.2             30.0
2           3   Tablet   700.0     4.8              NaN
3           4  Monitor   300.0     NaN             10.0
4           5      NaN     NaN     4.3              5.0


# Data Normalization

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Small product table used to compare two feature-scaling methods
data = {
    'Product': ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard'],
    'Price': [1000, 500, 700, 300, 100],
    'Rating': [4.5, 4.2, 4.8, 4.0, 3.8],
    'Stock': [15, 30, 10, 5, 20],
}

df = pd.DataFrame(data)
print("Original Data:\n", df)

# Take the unscaled numeric features once, so both scalers are fitted on the
# same original values rather than on each other's output.
feature_cols = ['Price', 'Rating', 'Stock']
raw_features = df[feature_cols]

# 1. Min-max normalization: rescales each column to the [0, 1] range
scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(raw_features)
df[[f'{col}_Norm' for col in feature_cols]] = minmax_values

# 2. Z-score normalization (standardization): zero mean, unit variance per
#    column, using the population standard deviation
standard_scaler = StandardScaler()
zscore_values = standard_scaler.fit_transform(raw_features)
df[[f'{col}_Std' for col in feature_cols]] = zscore_values

print("\nNormalized Data:\n", df)

Original Data:
     Product  Price  Rating  Stock
0    Laptop   1000     4.5     15
1     Phone    500     4.2     30
2    Tablet    700     4.8     10
3   Monitor    300     4.0      5
4  Keyboard    100     3.8     20

Normalized Data:
     Product  Price  Rating  Stock  Price_Norm  Rating_Norm  Stock_Norm  \
0    Laptop   1000     4.5     15    1.000000          0.7         0.4   
1     Phone    500     4.2     30    0.444444          0.4         1.0   
2    Tablet    700     4.8     10    0.666667          1.0         0.2   
3   Monitor    300     4.0      5    0.222222          0.2         0.0   
4  Keyboard    100     3.8     20    0.000000          0.0         0.6   

   Price_Std  Rating_Std  Stock_Std  
0   1.536443    0.675053  -0.116248  
1  -0.064018   -0.168763   1.627467  
2   0.576166    1.518869  -0.697486  
3  -0.704203   -0.731307  -1.278724  
4  -1.344387   -1.293851   0.464991  
