# Smart Product Pricing Challenge - Data Exploration

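This notebook explores the training dataset (price distribution, sample catalog content, and feature correlations) and implements the preprocessing steps: text cleaning, item pack quantity (IPQ) extraction, and text-length features.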


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import warnings
# No category or message is given, so this filter hides every warning
# raised after this point.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
PLOT_STYLE = 'seaborn-v0_8'
COLOR_PALETTE = "husl"
plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)


In [None]:
# Read the raw training set and report its dimensions and column names.
train_csv_path = '../dataset/train.csv'
train_df = pd.read_csv(train_csv_path)

column_names = train_df.columns.tolist()
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {column_names}")

# Last expression of the cell: rendered as the rich table preview.
train_df.head()


In [None]:
# Basic data info: schema summary, per-column null counts, and target stats.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()
print("\nMissing values:")
print(train_df.isnull().sum())
print("\nPrice statistics:")
print(train_df['price'].describe())


In [None]:
# Price distribution: raw histogram, log1p histogram, and box plot side by side.
prices = train_df['price']
fig, (ax_raw, ax_log, ax_box) = plt.subplots(1, 3, figsize=(15, 5))

# Left panel: histogram of the untransformed price.
ax_raw.hist(prices, bins=50, alpha=0.7)
ax_raw.set_title('Price Distribution')
ax_raw.set_xlabel('Price')
ax_raw.set_ylabel('Frequency')

# Middle panel: same histogram after a log(1 + x) transform.
ax_log.hist(np.log1p(prices), bins=50, alpha=0.7)
ax_log.set_title('Log Price Distribution')
ax_log.set_xlabel('Log(Price + 1)')
ax_log.set_ylabel('Frequency')

# Right panel: box plot of the untransformed price.
ax_box.boxplot(prices)
ax_box.set_title('Price Box Plot')
ax_box.set_ylabel('Price')

fig.tight_layout()
plt.show()


In [None]:
# Print the catalog text and price of the first three rows for a quick look.
print("Sample catalog content:")
for position in range(3):
    sample_row = train_df.iloc[position]  # fetch the row once, read two fields
    print(f"\nSample {position + 1}:")
    print(sample_row['catalog_content'])
    print(f"Price: {sample_row['price']}")


In [None]:
# Text preprocessing functions
def extract_ipq(text):
    """Extract the Item Pack Quantity (IPQ) from free-form catalog text.

    The text is lower-cased and searched with a fixed, ordered list of
    patterns ("pack of 5", "5 pack", "quantity: 10", "ipq: 3", "5 x 2").
    The first pattern that matches decides the result.

    Args:
        text: Catalog text. Missing or non-string values (NaN, None, numbers)
            are accepted and treated as "no quantity found".

    Returns:
        int: The captured quantity. For the two-number "A x B" pattern the
        product A * B is returned. Returns 1 when nothing matches or when
        ``text`` is not a string.
    """
    # Missing values are not strings (NaN is a float) and have no .lower();
    # fall back to the default instead of raising, as clean_text does.
    if not isinstance(text, str):
        return 1

    # Look for patterns like "Pack of 5", "5 Pack", "Quantity: 10", etc.
    # Order matters: earlier patterns take precedence over later ones.
    patterns = [
        r'pack of (\d+)',
        r'(\d+) pack',
        r'quantity[\s:]+(\d+)',
        r'ipq[\s:]+(\d+)',
        r'(\d+)\s*x\s*(\d+)',  # For cases like "5 x 2"
    ]

    # Lower-case once up front rather than once per pattern.
    lowered = text.lower()

    for pattern in patterns:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        # One captured number is the quantity itself; two captured numbers
        # ("5 x 2") are multiplied together.
        quantity = 1
        for captured in match.groups():
            quantity *= int(captured)
        return quantity

    return 1  # Default to 1 if not found

def clean_text(text):
    """Normalize catalog text to lower-case alphanumerics separated by single spaces.

    Missing values (as judged by ``pd.isna``) yield an empty string. Otherwise
    the text is lower-cased, every character that is not a letter, digit, or
    whitespace becomes a space, and runs of whitespace collapse to one space
    with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""

    lowered = text.lower()
    # Punctuation and other symbols turn into spaces so adjacent words stay apart.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    # Squeeze whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Derive the cleaned text and pack-quantity columns from the raw catalog text.
catalog_text = train_df['catalog_content']
train_df['cleaned_text'] = catalog_text.apply(clean_text)
train_df['ipq'] = catalog_text.apply(extract_ipq)

# Summarize the extracted quantities: ten most frequent values, mean, and max.
ipq_values = train_df['ipq']
print("IPQ extraction results:")
print(ipq_values.value_counts().head(10))
print(f"\nAverage IPQ: {ipq_values.mean():.2f}")
print(f"Max IPQ: {ipq_values.max()}")


In [None]:
# Text length analysis: character and word counts of the cleaned text vs. price.
cleaned_text = train_df['cleaned_text']
train_df['text_length'] = cleaned_text.str.len()
train_df['word_count'] = cleaned_text.str.split().str.len()

# One scatter panel per feature: (column, panel title, x-axis label).
scatter_panels = [
    ('text_length', 'Text Length vs Price', 'Text Length'),
    ('word_count', 'Word Count vs Price', 'Word Count'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (feature, panel_title, x_label) in zip(axes, scatter_panels):
    ax.scatter(train_df[feature], train_df['price'], alpha=0.5)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Price')

fig.tight_layout()
plt.show()


In [None]:
# Correlation analysis: pairwise correlation of the engineered features and price.
numeric_features = ['ipq', 'text_length', 'word_count', 'price']
correlation_matrix = train_df[numeric_features].corr()

# Annotated heatmap with the color scale centered on zero correlation.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()


In [None]:
# Write the frame, including the derived columns, next to the raw dataset.
preprocessed_csv_path = '../dataset/train_preprocessed.csv'
train_df.to_csv(preprocessed_csv_path, index=False)  # index=False: omit the row index column
print("Preprocessed training data saved!")
