In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from src.data import BasicCleaner
from src.data import ContentBasedFeatures

from src.models.content_based import ContentBasedRecommender

import scipy

import sklearn.preprocessing

In [2]:
import os
import sys

In [3]:
# Display the absolute location that the relative '../hadoop' path resolves to
# from the notebook's current working directory.
os.path.abspath('../hadoop')

'I:\\learning_projects\\projects\\ecommerce\\hadoop'

In [4]:
# Make the PySpark driver and workers use the interpreter running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Store HADOOP_HOME as an absolute path: a relative value is resolved against
# the working directory of whichever process reads it, and Hadoop's shell
# utilities expect an absolute home directory.
os.environ['HADOOP_HOME'] = os.path.abspath('../hadoop')

# Put <HADOOP_HOME>/bin at the front of PATH using the platform's separator.
# The membership check keeps the cell idempotent, so re-running it does not
# keep growing PATH.
hadoop_bin_dir = os.path.join(os.environ['HADOOP_HOME'], 'bin')
current_path = os.environ.get('PATH', '')
if hadoop_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = os.pathsep.join(
        entry for entry in (hadoop_bin_dir, current_path) if entry
    )

In [5]:
# Create (or reuse) the Spark session used throughout the notebook.
# NOTE(review): "I:/spark_temp" is a machine-specific scratch directory; adjust
# it when running on a different machine.
spark = (
    SparkSession.builder
    .appName("EcommerceAnalysis")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.local.dir", "I:/spark_temp")
    .config("spark.sql.shuffle.partitions", "10")
    .getOrCreate()
)

In [6]:
# Read the October 2019 CSV with a header row, letting Spark infer column
# types, and mark the resulting frame for caching.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/raw/2019-Oct.csv')
    .cache()
)

In [7]:
# Run the project's BasicCleaner over the loaded frame.
# NOTE(review): `df` is rebound to the cleaned result, so this cell is not
# idempotent -- re-running it feeds already-cleaned data back into the cleaner.
cleaner = BasicCleaner()
df = cleaner.clean_df(df)

INFO:src.data.basic_cleaning:Cleaning DataFrame...
INFO:src.data.basic_cleaning:Removing duplicates from DataFrame...
INFO:src.data.basic_cleaning:Price filtering...
INFO:src.data.basic_cleaning:Handling missing category_code...
INFO:src.data.basic_cleaning:Handling missing brands...


In [None]:
# Keep a 10,000-row working sample and persist it as parquet, replacing any
# previous copy.
# NOTE(review): limit() is applied without an explicit ordering, so which rows
# end up in the sample presumably depends on the physical plan -- confirm if
# reproducibility of the sample matters.
small_df = df.limit(10000)
small_df.write.mode('overwrite').parquet('temp_data.parquet')

In [6]:
# Reload the persisted sample from parquet.
small_df = spark.read.format('parquet').load('temp_data.parquet')

In [7]:
# Instantiate the content-based feature builder and the recommender; both are
# constructed with the active SparkSession.
cb_feature_transformer = ContentBasedFeatures(spark)
model = ContentBasedRecommender(spark)

In [8]:
# Build the content-based feature frame from the sampled events in `small_df`.
df_train = cb_feature_transformer.create_features(small_df)

INFO:src.data.feature_engineering:Creating weighted category paths...
INFO:src.data.feature_engineering:Training Word2Vec model...
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 401 word types from a corpus of 401 raw words and 401 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 401 unique words (100.00% of original 401, drops 0)', 'datetime': '2024-12-25T12:44:41.009865', 'gensim': '4.3.3', 'python': '3.11.8 (tags/v3.11.8:db85d51, Feb  6 2024, 22:03:32) [MSC v.1937 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 401 word corpus (100.00% of original 401, drops 0)', 'datetime': '2024-12-25T12:44:41.010865', 'gensim': '4.3.3'

+-------------+------------------+
|category_code|category_embedding|
+-------------+------------------+
+-------------+------------------+

Feature columns: ['category_emb_0', 'category_emb_1', 'category_emb_2', 'category_emb_3', 'category_emb_4', 'category_emb_5', 'category_emb_6', 'category_emb_7', 'category_emb_8', 'category_emb_9', 'category_emb_10', 'category_emb_11', 'category_emb_12', 'category_emb_13', 'category_emb_14', 'category_emb_15', 'category_emb_16', 'category_emb_17', 'category_emb_18', 'category_emb_19', 'category_emb_20', 'category_emb_21', 'category_emb_22', 'category_emb_23', 'category_emb_24', 'category_emb_25', 'category_emb_26', 'category_emb_27', 'category_emb_28', 'category_emb_29', 'category_emb_30', 'category_emb_31', 'price_normalized']
Columns in df_aggregated: ['product_id', 'category_embedding', 'price_normalized', 'category_emb_0', 'category_emb_1', 'category_emb_2', 'category_emb_3', 'category_emb_4', 'category_emb_5', 'category_emb_6', 'category_emb_

In [14]:
# Sanity check: count the feature rows for product 5100816.
df_train.where(F.col('product_id') == 5100816).count()

1

In [15]:
# Fit the content-based recommender on the engineered feature frame.
model.fit(df_train)

INFO:src.models.content_based:Preparing feature vectors...


Number of rows in DataFrame: 5854


INFO:src.models.content_based:Calculating similarity matrix...


Number of items collected: 5854
First few items: [(1002099, array([ 0.01956469, -0.01584572,  0.00852447,  0.00795345,  0.00749321,
        0.0260408 , -0.02160443, -0.00718572,  0.00315297,  0.01993049,
       -0.00202729,  0.02477537, -0.01453724, -0.02822312, -0.02204406,
       -0.0129303 ,  0.02994781, -0.0160227 , -0.01515113,  0.02554213,
       -0.02330058,  0.0062805 ,  0.01786264, -0.0235323 , -0.0008218 ,
       -0.00740059,  0.01739257, -0.01425141, -0.02063996,  0.03091168,
       -0.01444734,  0.02376332,  0.14348958])), (1002544, array([ 0.01956469, -0.01584572,  0.00852447,  0.00795345,  0.00749321,
        0.0260408 , -0.02160443, -0.00718572,  0.00315297,  0.01993049,
       -0.00202729,  0.02477537, -0.01453724, -0.02822312, -0.02204406,
       -0.0129303 ,  0.02994781, -0.0160227 , -0.01515113,  0.02554213,
       -0.02330058,  0.0062805 ,  0.01786264, -0.0235323 , -0.0008218 ,
       -0.00740059,  0.01739257, -0.01425141, -0.02063996,  0.03091168,
       -0.0144473

In [16]:
# Query the fitted recommender for the products most similar to 5100816
# (presumably a product_id, matching the filter used on df_train earlier).
model.get_similar_products(5100816)

[(5100767, 0.9999902823970895),
 (5100768, 0.9999902823970895),
 (5100871, 0.9999838860754093),
 (5100781, 0.9999384049840955),
 (5100607, 0.9999077712579397)]

In [25]:
from scipy.stats import chi2_contingency
from sklearn.metrics import normalized_mutual_info_score
import numpy as np
import pandas as pd
from collections import defaultdict

def _gini_of_positive_counts(counts):
    """Return the Gini coefficient of the strictly positive entries of ``counts``.

    Args:
        counts: 1-D NumPy array of non-negative counts.

    Returns:
        The Gini coefficient as a NumPy float, or ``None`` when ``counts`` has
        no positive entry.
    """
    positive = np.sort(counts[counts > 0])
    n = len(positive)
    if n == 0:
        return None
    rank = np.arange(1, n + 1)
    return np.sum((2 * rank - n - 1) * positive) / (n * np.sum(positive))


def analyze_brand_category_relationships(df):
    """Analyze relationships between brands and categories.

    The Spark frame is aggregated once into (brand, category, count) triples;
    every statistic below is derived from that aggregate on the driver, so the
    full event-level data is never collected.

    Args:
        df: Spark DataFrame with at least the columns ``brand`` and
            ``category_code``.

    Returns:
        dict with the keys:
            ``brand_concentration``: brand -> Gini coefficient of its
                per-category counts (only categories the brand appears in).
            ``brand_exclusivity``: brand -> dict with ``top_category``,
                ``category_share`` and ``total_categories``.
            ``nmi_score``: normalized mutual information between the brand and
                category labels of the rows.
            ``contingency_matrix``: pandas DataFrame, brands x categories,
                holding co-occurrence counts (0 where a pair never occurs).
            ``stats``: dict with ``total_brands``, ``total_categories``,
                ``avg_categories_per_brand`` and
                ``median_categories_per_brand``.
    """
    # Standardize brand names to lowercase
    df = df.withColumn('brand_std', F.lower(F.col('brand')))

    # 1. Basic co-occurrence analysis -- the only Spark action in this function.
    # No ordering is requested here: the pivot below sorts its own index and
    # columns, so sorting in Spark first would be wasted work.
    pair_counts = df.groupBy('brand_std', 'category_code').count().toPandas()

    brand_cat_matrix = pair_counts \
        .pivot(index='brand_std', columns='category_code', values='count') \
        .fillna(0)

    # Number of distinct categories each brand appears in.
    total_categories = brand_cat_matrix.astype(bool).sum(axis=1)

    # 2 + 3. Per-brand concentration and exclusivity in a single pass over the
    # matrix rows (plain NumPy arrays instead of repeated label lookups).
    # NOTE(review): the Gini coefficient is taken over the positive entries
    # only, so a brand present in exactly one category scores 0.
    category_labels = brand_cat_matrix.columns
    brand_concentration = {}
    brand_exclusivity = {}
    for brand, counts, n_categories in zip(brand_cat_matrix.index,
                                           brand_cat_matrix.to_numpy(),
                                           total_categories.to_numpy()):
        gini = _gini_of_positive_counts(counts)
        if gini is not None:
            brand_concentration[brand] = gini

        top_position = counts.argmax()  # first maximum, same pick as idxmax()
        brand_exclusivity[brand] = {
            'top_category': category_labels[top_position],
            'category_share': counts[top_position] / counts.sum(),
            'total_categories': n_categories
        }

    # 4. Calculate normalized mutual information.
    # Brand and category labels must be paired row by row. Collecting the two
    # columns in separate Spark jobs gives no guarantee that the rows line up,
    # so the label arrays are rebuilt from the aggregated pairs instead: each
    # (brand, category) pair is repeated `count` times. NMI depends only on the
    # joint label counts, so this matches the row-level computation.
    brand_codes = pd.factorize(pair_counts['brand_std'])[0]
    category_codes = pd.factorize(pair_counts['category_code'])[0]
    repeats = pair_counts['count'].to_numpy(dtype='int64')

    nmi_score = normalized_mutual_info_score(
        np.repeat(brand_codes, repeats),
        np.repeat(category_codes, repeats)
    )

    # Add some basic statistics (a missing brand/category counts as one value,
    # as it would with a Spark distinct()).
    stats = {
        'total_brands': pair_counts['brand_std'].nunique(dropna=False),
        'total_categories': pair_counts['category_code'].nunique(dropna=False),
        'avg_categories_per_brand': total_categories.mean(),
        'median_categories_per_brand': total_categories.median()
    }

    return {
        'brand_concentration': brand_concentration,
        'brand_exclusivity': brand_exclusivity,
        'nmi_score': nmi_score,
        'contingency_matrix': brand_cat_matrix,
        'stats': stats
    }

def get_top_category_brands(df, top_n=10):
    """Get top brands for each category.

    Args:
        df: Spark DataFrame with a ``category_code`` column and either a
            ``brand_std`` column or a ``brand`` column from which ``brand_std``
            (lower-cased brand) is derived.
        top_n: maximum number of brands to keep per category.

    Returns:
        dict mapping each category_code to a list of at most ``top_n`` Rows
        with the fields ``brand_std`` and ``count``, ordered by descending
        count (ties broken by brand name).
    """
    # The raw event frame carries `brand`, not `brand_std`; derive the
    # lower-cased column here so the function works on its own.
    if 'brand_std' not in df.columns:
        df = df.withColumn('brand_std', F.lower(F.col('brand')))

    per_category = df.groupBy('category_code', 'brand_std') \
        .count() \
        .groupBy('category_code') \
        .agg(F.collect_list(F.struct('brand_std', 'count')).alias('brands'))

    def _rank_brands(row):
        # collect_list does not promise any element order after the shuffle, so
        # the ranking is done explicitly here rather than by sorting the frame
        # beforehand. Fields are read by key because Row is a tuple subclass
        # and attribute access to `count` would hit tuple.count.
        ranked = sorted(
            row['brands'],
            key=lambda entry: (-entry['count'], entry['brand_std'] or '')
        )
        return row['category_code'], ranked[:top_n]

    return per_category.rdd.map(_rank_brands).collectAsMap()

def print_analysis_results(results):
    """Write a human-readable report of the brand/category analysis to stdout.

    Args:
        results: dict with the keys ``stats``, ``nmi_score``,
            ``brand_concentration`` and ``brand_exclusivity``, laid out as
            returned by ``analyze_brand_category_relationships``.
    """
    print("=== Brand-Category Analysis Results ===")

    print("\nBasic Statistics:")
    summary = results['stats']
    print(f"Total unique brands: {summary['total_brands']}")
    print(f"Total unique categories: {summary['total_categories']}")
    print(f"Average categories per brand: {summary['avg_categories_per_brand']:.2f}")
    print(f"Median categories per brand: {summary['median_categories_per_brand']:.2f}")

    print("\n1. Overall Category-Brand Correlation:")
    print(f"Normalized Mutual Information Score: {results['nmi_score']:.4f}")

    print("\n2. Top 10 Most Category-Focused Brands:")
    # Highest concentration first; only the first ten are reported.
    by_concentration = list(results['brand_concentration'].items())
    by_concentration.sort(key=lambda pair: pair[1], reverse=True)
    for name, score in by_concentration[:10]:
        print(f"{name}: {score:.4f}")

    print("\n3. Category Exclusivity Analysis:")
    # Keep brands whose top category holds more than 20% of their activity and
    # that appear in more than five categories.
    qualifying = []
    for name, details in results['brand_exclusivity'].items():
        if details['category_share'] > 0.2 and details['total_categories'] > 5:
            qualifying.append((name, details))
    qualifying.sort(key=lambda pair: pair[1]['category_share'], reverse=True)

    print("\nTop 10 Category-Exclusive Brands (with significant presence):")
    for name, details in qualifying[:10]:
        print(f"{name}: {details['top_category']} ({details['category_share']:.2%} of activity)")

# Run the brand/category analysis on the sampled frame and print the report.
results = analyze_brand_category_relationships(small_df)
print_analysis_results(results)

=== Brand-Category Analysis Results ===

Basic Statistics:
Total unique brands: 867
Total unique categories: 368
Average categories per brand: 2.03
Median categories per brand: 1.00

1. Overall Category-Brand Correlation:
Normalized Mutual Information Score: 0.6250

2. Top 10 Most Category-Focused Brands:
xiaomi: 0.8600
samsung: 0.8402
apple: 0.8115
huawei: 0.7744
asus: 0.6894
hp: 0.6798
acer: 0.6792
unknown: 0.6666
respect: 0.6447
lenovo: 0.6368

3. Category Exclusivity Analysis:

Top 10 Category-Exclusive Brands (with significant presence):
huawei: electronics.smartphone (91.70% of activity)
asus: computers.notebook (81.82% of activity)
apple: electronics.smartphone (80.21% of activity)
acer: computers.notebook (73.12% of activity)
samsung: electronics.smartphone (72.75% of activity)
respect: apparel.shoes (72.37% of activity)
xiaomi: electronics.smartphone (68.99% of activity)
hp: computers.notebook (54.55% of activity)
beko: appliances.kitchen.refrigerators (46.15% of activity)
mid