In [None]:
# Attach Google Drive to this Colab runtime; later cells read and write files under it.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the dataset CSV from the mounted Drive folder into a DataFrame.
file_path = '/content/drive/MyDrive/IIT BHU - DS Assessment dataset/data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 1. Handle Missing Values
# The label is never imputed: filling `is_clicked` with its median would invent
# outcomes for rows that were never observed, so unlabeled rows are dropped instead.
label_col = 'is_clicked'
if label_col in data.columns:
    data = data.dropna(subset=[label_col]).copy()

# Column groups by dtype. `numerical_cols` still lists the label when it is numeric,
# exactly as before; the encoding step removes it with drop(..., errors='ignore').
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill missing numerical feature values with the column median
numeric_feature_cols = numerical_cols.drop(label_col, errors='ignore')
data[numeric_feature_cols] = data[numeric_feature_cols].fillna(data[numeric_feature_cols].median())

# Fill missing categorical values with the most frequent value.
# mode() returns no rows when there is nothing to summarise, so guard the .iloc[0].
categorical_modes = data[categorical_cols].mode()
if not categorical_modes.empty:
    data[categorical_cols] = data[categorical_cols].fillna(categorical_modes.iloc[0])

In [None]:
# 2. Feature Extraction
# Two query-level features derived from `search_term`:
#   - search_term_length: len() of the term
#   - is_popular_search_term: 1 if the term is among the 20 most frequent terms, else 0
term_frequencies = data['search_term'].value_counts()
most_common_terms = term_frequencies.head(20).index

data['search_term_length'] = data['search_term'].map(len)
data['is_popular_search_term'] = data['search_term'].isin(most_common_terms).astype(int)

In [None]:

# 3. Encode Categorical Variables
target = 'is_clicked'  # Define the target variable here

# Re-derive the column groups from the current frame instead of reusing the lists
# captured before feature extraction. Columns that appear in no transformer are
# discarded by ColumnTransformer (remainder='drop' is the default), so the stale
# lists silently threw away `search_term_length` and `is_popular_search_term`.
feature_frame = data.drop(columns=[target], errors='ignore')  # the target is never a feature
numerical_cols = feature_frame.select_dtypes(include='number').columns
categorical_cols = feature_frame.select_dtypes(include=['object']).columns

# Scale numerical columns; one-hot encode categorical columns. Categories not seen
# during fit are ignored at transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [None]:
# 4. Target Variable
# Separate the label column from the feature columns.
y = data[target]
X = data.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the preprocessing on the training rows only, then apply it to the test rows.
# (Tuple elements evaluate left to right, so the fit happens before the transform.)
X_train, X_test = preprocessor.fit_transform(X_train), preprocessor.transform(X_test)


In [None]:
# 5. Build a Simple Prediction Model
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the preprocessed training matrix
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Feature Importance (for linear models): pair each coefficient with its column name.
# Names follow the transformer order: numerical columns first, then the one-hot columns.
importance = model.coef_
fitted_encoder = preprocessor.named_transformers_['cat']
onehot_names = fitted_encoder.get_feature_names_out(categorical_cols).tolist()
feature_names = [*numerical_cols.tolist(), *onehot_names]
feature_importance = pd.Series(importance, index=feature_names).sort_values(ascending=False)

print("Feature Importance:")
print(feature_importance)

Mean Squared Error: 0.12659483881126682
Feature Importance:
query_type_head                                                                                          27.737006
query_type_tail                                                                                          27.486191
query_type_5.342857142857146                                                                             18.541543
query_type_54.01                                                                                         18.429675
predicted_category_name_{'Cleaning Essentials': 1, 'Electricals & Accessories': 0.5, 'Zepto Cafe': 2}    17.673990
                                                                                                           ...    
query_type_82.537090909091                                                                              -17.933973
product_variant_id_Breakfast & Sauces                                                                   -55.223197
city_id_Honey & Spre

In [None]:
"""# Retrieve column names from the transformers
num_cols_transformed = numerical_cols
# Access OneHotEncoder using 'onehot'
cat_cols_transformed = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)

all_cols_transformed = np.concatenate([num_cols_transformed, cat_cols_transformed])

print(len(num_cols_transformed))
print(len(cat_cols_transformed))
print(len(all_cols_transformed))"""

"# Retrieve column names from the transformers\nnum_cols_transformed = numerical_cols\n# Access OneHotEncoder using 'onehot'\ncat_cols_transformed = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)\n\nall_cols_transformed = np.concatenate([num_cols_transformed, cat_cols_transformed])\n\nprint(len(num_cols_transformed))\nprint(len(cat_cols_transformed))\nprint(len(all_cols_transformed))"

In [None]:

# Option 1: Remove rows with any missing values
# NOTE(review): `data` was already imputed by the missing-value cell earlier in the
# notebook, so on a top-to-bottom run this only drops rows that imputation could not
# fill. Confirm whether this step was meant to start from the raw file instead.
data_dropped = data.dropna()

# Option 2: Fill missing values (imputation)
data_filled = data.copy()

# For numerical columns, fill with the median
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data_filled[numerical_cols] = data_filled[numerical_cols].fillna(data_filled[numerical_cols].median())

# For categorical columns, fill with the most frequent value.
# mode() returns no rows when there is nothing to summarise, so guard the .iloc[0].
categorical_cols = data.select_dtypes(include=['object']).columns
categorical_modes = data_filled[categorical_cols].mode()
if not categorical_modes.empty:
    data_filled[categorical_cols] = data_filled[categorical_cols].fillna(categorical_modes.iloc[0])

# Save each cleaned variant to its own CSV file. The imputed copy goes to
# data_filled.csv so that the input file data.csv is never overwritten by its own
# derived output (the notebook's load cell reads data.csv on every run).
data_dropped.to_csv('/content/drive/MyDrive/IIT BHU - DS Assessment dataset/data_dropped.csv', index=False)
data_filled.to_csv('/content/drive/MyDrive/IIT BHU - DS Assessment dataset/data_filled.csv', index=False)

print("Data with rows containing any missing values removed:")
print(data_dropped.head())

print("\nData with missing values filled:")
print(data_filled.head())



Data with rows containing any missing values removed:
           search_term                    product_variant_id  \
0    akshayakalpa milk  c442ad9b-09b6-4505-a17d-7d2b3cceee0e   
1            ice cubes  33e1c372-8f6b-4312-b4df-93911f4c1caf   
2              protien  8ed8801e-f3bd-46ed-a212-24dcaa33d937   
3                 comb  b16224dd-5b0b-4e39-91d0-b52e563c70c2   
4  nail polish remover  25efac15-9a86-4da1-ab36-94e32bc7ecd9   

                                city_id query_type  is_clicked  total_clicks  \
0  ee66dc2a-aded-4445-a7b2-1ad63715725c       head         0.0           0.0   
1  7e926d2f-adad-4e5a-956f-f07fffa54164       head         0.0           0.0   
2  078d5e32-627a-4907-8df8-4360bc7c06da       head         1.0          29.0   
3  4f30407c-6a3c-4a4e-8a3d-652217d4b6cb       head         0.0           0.0   
4  4f30407c-6a3c-4a4e-8a3d-652217d4b6cb       tail         0.0           0.0   

   session_views  query_products_clicks_last_30_days  CTR_last_30_days  \
0     

In [None]:
# Define chunk size
chunk_size = 10000

file_path = '/content/drive/MyDrive/IIT BHU - DS Assessment dataset/data_dropped.csv'

# Define preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Read the file chunk by chunk. The context manager closes the underlying file
# handle even if preprocessing a chunk raises part-way through.
with pd.read_csv(file_path, chunksize=chunk_size) as chunks:
    # Identify numerical and categorical columns from the first chunk
    first_chunk = next(chunks)
    numerical_cols = first_chunk.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = first_chunk.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit on the first chunk and transform it.
    # NOTE(review): imputation values, scaling statistics and the one-hot categories
    # all come from the first `chunk_size` rows only; a category that first appears
    # in a later chunk is encoded as all zeros because of handle_unknown='ignore'.
    first_chunk_preprocessed = preprocessor.fit_transform(first_chunk)

    # The output column names are fixed once the preprocessor is fitted, so build
    # them once here instead of on every loop iteration.
    output_columns = np.concatenate([
        numerical_cols,
        preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols),
    ])

    # Collect the processed chunks, starting with the one used for fitting
    processed_chunks = [pd.DataFrame(first_chunk_preprocessed, columns=output_columns)]

    # Process remaining chunks with the already-fitted preprocessor
    for chunk in chunks:
        chunk_preprocessed = preprocessor.transform(chunk)
        processed_chunks.append(pd.DataFrame(chunk_preprocessed, columns=output_columns))

# Concatenate all processed chunks
data_cleaned = pd.concat(processed_chunks, ignore_index=True)

# Save the cleaned data to a new CSV file
cleaned_file_path = '/content/drive/MyDrive/IIT BHU - DS Assessment dataset/data_dropped_cleaned.csv'
data_cleaned.to_csv(cleaned_file_path, index=False)

# Correlation Analysis
correlation_matrix = data_cleaned.corr()

# Plot the correlation matrix
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Cleaned Data')
plt.show()

print("Cleaned and Preprocessed Data:")
print(data_cleaned.head())
