# Data Analysis: Laptop E-Commerce

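**Purpose**: Generate insights and visualizations from the cleaned laptop listing data (price distribution, brand popularity, and rating vs. price), saving each chart as a PNG in `output/`.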

---
## 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Shared look for every chart produced in this notebook.
sns.set_style("whitegrid")
sns.set_palette("husl")
# Default figure size; the analysis methods below pass their own figsize.
plt.rcParams['figure.figsize'] = (12, 6)
print("✓ Libraries imported!")

---
## 2. LaptopAnalysis Class

In [None]:
class LaptopAnalysis:
    """Summarise and chart a cleaned laptop-listing CSV.

    The analysis methods read the columns ``Brand``, ``Price (USD)``,
    ``Rating`` and ``Review Count`` from ``self.df`` and write one PNG per
    analysis into ``self.output_dir``.

    Attributes:
        data_file: Path of the CSV read by ``load_data``.
        df: The loaded ``pandas.DataFrame``, or ``None`` until ``load_data``
            succeeds.
        output_dir: Directory that receives the generated PNG files.
    """

    # |r| at or below this value is reported as a weak price/rating correlation.
    CORRELATION_THRESHOLD = 0.3

    def __init__(self, data_file):
        """Store the CSV path and make sure the output directory exists.

        Args:
            data_file: Path of the cleaned CSV to analyse.
        """
        self.data_file = data_file
        self.df = None
        self.output_dir = "output"

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _require_data(self, *columns):
        """Fail early, with a clear message, when the data cannot be analysed.

        Args:
            *columns: Column names the calling analysis is about to read.

        Raises:
            RuntimeError: If no data has been loaded yet.
            ValueError: If the loaded frame has no rows.
            KeyError: If any of ``columns`` is absent from the frame.
        """
        if self.df is None:
            raise RuntimeError(
                "No data loaded; call load_data() successfully before running an analysis."
            )
        if self.df.empty:
            raise ValueError("Loaded data contains no records to analyze.")
        missing = [column for column in columns if column not in self.df.columns]
        if missing:
            raise KeyError(f"Missing required column(s): {missing}")

    def _save_figure(self, fig, filename, description):
        """Write ``fig`` to ``output_dir/filename``, report it, and close it."""
        path = f'{self.output_dir}/{filename}'
        try:
            fig.savefig(path, dpi=300, bbox_inches='tight')
            print(f"✓ Saved {description} to {path}")
        finally:
            # Close even when saving fails so figures do not pile up in memory.
            plt.close(fig)

    @staticmethod
    def _fit_trend(x, y, points=100):
        """Return ``(x_values, y_values)`` of a least-squares line, or ``None``.

        Rows where either value is missing are ignored. ``None`` is returned
        when fewer than two distinct x values remain, because a straight line
        cannot be fitted in that case.
        """
        pairs = pd.DataFrame({'x': x, 'y': y}).dropna()
        if len(pairs) < 2 or pairs['x'].nunique() < 2:
            return None
        slope, intercept = np.polyfit(pairs['x'], pairs['y'], 1)
        x_trend = np.linspace(pairs['x'].min(), pairs['x'].max(), points)
        return x_trend, slope * x_trend + intercept

    @staticmethod
    def _bin_prices(prices):
        """Bucket prices into the labelled ranges used by the rating analysis."""
        # The last edge is open-ended so "$2000+" really covers every price
        # above 2000 instead of silently dropping anything over 5000.
        return pd.cut(prices,
                      bins=[0, 500, 1000, 1500, 2000, np.inf],
                      labels=['$0-500', '$500-1000', '$1000-1500',
                              '$1500-2000', '$2000+'],
                      include_lowest=True)

    @staticmethod
    def _top_brand_shares(brand_counts, top_n=5):
        """Return the ``top_n`` counts plus an ``Other`` entry for the rest.

        ``brand_counts`` must already be sorted in descending order (as
        ``value_counts`` returns it). Keeping the remainder as ``Other`` makes
        pie percentages reflect the whole market rather than only the top
        brands.
        """
        top = brand_counts.iloc[:top_n]
        remainder = brand_counts.iloc[top_n:].sum()
        if remainder > 0:
            top = pd.concat([top, pd.Series({'Other': remainder})])
        return top

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def load_data(self):
        """Load the cleaned data from ``self.data_file`` into ``self.df``.

        Returns:
            True when the CSV was read, False when reading failed (the error
            is printed rather than raised).
        """
        print("=" * 60)
        print("Loading cleaned data for analysis...")
        print("=" * 60)

        try:
            self.df = pd.read_csv(self.data_file)
        except Exception as e:
            # Deliberately broad: any read/parse problem is reported to the
            # user and signalled through the return value.
            print(f"✗ Error loading data: {e}")
            return False

        print(f"✓ Loaded {len(self.df)} records")
        print(f"✓ Columns: {list(self.df.columns)}")
        return True

    def analyze_price_distribution(self):
        """Chart the price distribution overall and per brand.

        Saves ``price_distribution_analysis.png`` (histogram, per-brand box
        plot, average price per brand, min/max price per brand) and prints
        the most/least expensive brands and the widest price range.

        Raises:
            RuntimeError: If no data has been loaded.
            ValueError: If the loaded data has no rows.
            KeyError: If ``Brand`` or ``Price (USD)`` is missing.
        """
        self._require_data('Brand', 'Price (USD)')

        print("\n" + "=" * 60)
        print("ANALYSIS 1: Price Distribution Analysis")
        print("=" * 60)

        prices = self.df['Price (USD)']
        brand_prices = self.df.groupby('Brand')['Price (USD)']
        mean_price = prices.mean()
        median_price = prices.median()

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Laptop Price Distribution Analysis', fontsize=16, fontweight='bold')

        # 1. Overall price distribution (missing prices cannot be binned)
        ax1 = axes[0, 0]
        ax1.hist(prices.dropna(), bins=30, color='#4c6ef5', edgecolor='black', alpha=0.7)
        ax1.axvline(mean_price, color='red', linestyle='--',
                    linewidth=2, label=f'Mean: ${mean_price:.2f}')
        ax1.axvline(median_price, color='green', linestyle='--',
                    linewidth=2, label=f'Median: ${median_price:.2f}')
        ax1.set_xlabel('Price (USD)', fontsize=11)
        ax1.set_ylabel('Frequency', fontsize=11)
        ax1.set_title('Overall Price Distribution', fontsize=13, fontweight='bold')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Price by brand (box plot), brands ordered by median price
        ax2 = axes[0, 1]
        brand_order = brand_prices.median().sort_values(ascending=False).index
        sns.boxplot(data=self.df, y='Brand', x='Price (USD)', order=brand_order, ax=ax2, palette="Set2")
        ax2.set_xlabel('Price (USD)', fontsize=11)
        ax2.set_ylabel('Brand', fontsize=11)
        ax2.set_title('Price Distribution by Brand', fontsize=13, fontweight='bold')
        ax2.grid(True, alpha=0.3, axis='x')

        # 3. Average price by brand
        ax3 = axes[1, 0]
        avg_price = brand_prices.mean().sort_values(ascending=False)
        bars = ax3.barh(avg_price.index, avg_price.values, color='#51cf66', edgecolor='black')
        ax3.set_xlabel('Average Price (USD)', fontsize=11)
        ax3.set_ylabel('Brand', fontsize=11)
        ax3.set_title('Average Price by Brand', fontsize=13, fontweight='bold')

        # Value labels at the end of each bar
        for bar, value in zip(bars, avg_price.values):
            ax3.text(value, bar.get_y() + bar.get_height() / 2,
                     f' ${value:.2f}',
                     va='center', fontsize=9, fontweight='bold')
        ax3.grid(True, alpha=0.3, axis='x')

        # 4. Price range by brand, widest range first
        ax4 = axes[1, 1]
        price_stats = brand_prices.agg(['min', 'max'])
        price_stats['range'] = price_stats['max'] - price_stats['min']
        price_stats = price_stats.sort_values('range', ascending=False)

        x = np.arange(len(price_stats))
        width = 0.35

        ax4.bar(x - width / 2, price_stats['min'], width, label='Min Price',
                color='#ffd43b', edgecolor='black')
        ax4.bar(x + width / 2, price_stats['max'], width, label='Max Price',
                color='#ff6b6b', edgecolor='black')

        ax4.set_xlabel('Brand', fontsize=11)
        ax4.set_ylabel('Price (USD)', fontsize=11)
        ax4.set_title('Price Range by Brand', fontsize=13, fontweight='bold')
        ax4.set_xticks(x)
        ax4.set_xticklabels(price_stats.index, rotation=45, ha='right')
        ax4.legend()
        ax4.grid(True, alpha=0.3, axis='y')

        fig.tight_layout()
        self._save_figure(fig, 'price_distribution_analysis.png', 'price distribution analysis')

        # Print insights
        print("\n💡 Key Insights:")
        print(f"   • Most expensive brand (avg): {avg_price.idxmax()} (${avg_price.max():.2f})")
        print(f"   • Most affordable brand (avg): {avg_price.idxmin()} (${avg_price.min():.2f})")
        print(f"   • Widest price range: {price_stats.index[0]} (${price_stats['range'].iloc[0]:.2f})")

    def analyze_brand_popularity(self):
        """Chart product counts, market share, reviews and ratings per brand.

        Saves ``brand_popularity_analysis.png`` and prints the leading brand
        by product count, review total and average rating.

        Raises:
            RuntimeError: If no data has been loaded.
            ValueError: If the loaded data has no rows.
            KeyError: If ``Brand``, ``Rating`` or ``Review Count`` is missing.
        """
        self._require_data('Brand', 'Rating', 'Review Count')

        print("\n" + "=" * 60)
        print("ANALYSIS 2: Brand Popularity Analysis")
        print("=" * 60)

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Brand Popularity and Market Share Analysis', fontsize=16, fontweight='bold')

        # 1. Product count by brand
        ax1 = axes[0, 0]
        brand_counts = self.df['Brand'].value_counts()
        colors = sns.color_palette("husl", len(brand_counts))
        bars = ax1.bar(brand_counts.index, brand_counts.values, color=colors, edgecolor='black')
        ax1.set_xlabel('Brand', fontsize=11)
        ax1.set_ylabel('Number of Products', fontsize=11)
        ax1.set_title('Product Count by Brand', fontsize=13, fontweight='bold')
        ax1.tick_params(axis='x', rotation=45)

        for bar in bars:
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{int(height)}',
                     ha='center', va='bottom', fontsize=10, fontweight='bold')
        ax1.grid(True, alpha=0.3, axis='y')

        # 2. Market share pie chart
        ax2 = axes[0, 1]
        _, _, autotexts = ax2.pie(brand_counts.values, labels=brand_counts.index,
                                  autopct='%1.1f%%', colors=colors,
                                  startangle=90, textprops={'fontsize': 10})
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')
        ax2.set_title('Market Share by Brand', fontsize=13, fontweight='bold')

        # 3. Total review count by brand
        ax3 = axes[1, 0]
        total_reviews = self.df.groupby('Brand')['Review Count'].sum().sort_values(ascending=False)
        bars = ax3.barh(total_reviews.index, total_reviews.values, color='#4c6ef5', edgecolor='black')
        ax3.set_xlabel('Total Number of Reviews', fontsize=11)
        ax3.set_ylabel('Brand', fontsize=11)
        ax3.set_title('Customer Engagement by Brand (Total Reviews)', fontsize=13, fontweight='bold')

        for bar, value in zip(bars, total_reviews.values):
            ax3.text(value, bar.get_y() + bar.get_height() / 2,
                     f' {int(value)}',
                     va='center', fontsize=9, fontweight='bold')
        ax3.grid(True, alpha=0.3, axis='x')

        # 4. Average rating by brand, with the overall average as a reference line
        ax4 = axes[1, 1]
        overall_rating = self.df['Rating'].mean()
        avg_rating = self.df.groupby('Brand')['Rating'].mean().sort_values(ascending=False)
        bars = ax4.bar(avg_rating.index, avg_rating.values, color='#51cf66', edgecolor='black')
        ax4.set_xlabel('Brand', fontsize=11)
        ax4.set_ylabel('Average Rating', fontsize=11)
        ax4.set_title('Average Customer Rating by Brand', fontsize=13, fontweight='bold')
        ax4.set_ylim(0, 5)
        ax4.axhline(y=overall_rating, color='red', linestyle='--',
                    linewidth=2, label=f'Overall Avg: {overall_rating:.2f}')
        ax4.tick_params(axis='x', rotation=45)
        ax4.legend()
        ax4.grid(True, alpha=0.3, axis='y')

        for bar in bars:
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width() / 2., height,
                     f'{height:.2f}',
                     ha='center', va='bottom', fontsize=9, fontweight='bold')

        fig.tight_layout()
        self._save_figure(fig, 'brand_popularity_analysis.png', 'brand popularity analysis')

        # Print insights
        print("\n💡 Key Insights:")
        print(f"   • Most popular brand (by count): {brand_counts.idxmax()} ({brand_counts.max()} products)")
        print(f"   • Highest customer engagement: {total_reviews.idxmax()} ({total_reviews.max()} reviews)")
        print(f"   • Highest rated brand: {avg_rating.idxmax()} ({avg_rating.max():.2f}/5)")
        print(f"   • Market leader holds {brand_counts.max()/len(self.df)*100:.1f}% market share")

    def analyze_rating_vs_price(self):
        """Chart and describe the relationship between price and rating.

        Saves ``rating_price_correlation.png`` (per-brand scatter with a trend
        line, plus rating box plots per price range) and prints the
        correlation and the average rating of every price range.

        Side effect: adds (or overwrites) a categorical ``Price Range`` column
        on ``self.df``.

        Raises:
            RuntimeError: If no data has been loaded.
            ValueError: If the loaded data has no rows.
            KeyError: If ``Brand``, ``Price (USD)`` or ``Rating`` is missing.
        """
        self._require_data('Brand', 'Price (USD)', 'Rating')

        print("\n" + "=" * 60)
        print("ANALYSIS 3: Rating vs Price Correlation")
        print("=" * 60)

        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        fig.suptitle('Rating vs Price Correlation Analysis', fontsize=16, fontweight='bold')

        # 1. Scatter plot, one colour per brand
        ax1 = axes[0]
        brands = self.df['Brand'].unique()
        colors = sns.color_palette("husl", len(brands))

        for brand, color in zip(brands, colors):
            brand_data = self.df[self.df['Brand'] == brand]
            ax1.scatter(brand_data['Price (USD)'], brand_data['Rating'],
                        label=brand, alpha=0.6, s=100, color=color, edgecolor='black')

        # Trend line (skipped when the data cannot support a linear fit)
        trend = self._fit_trend(self.df['Price (USD)'], self.df['Rating'])
        if trend is not None:
            ax1.plot(trend[0], trend[1], "r--", linewidth=2, label='Trend Line')

        correlation = self.df['Price (USD)'].corr(self.df['Rating'])

        ax1.set_xlabel('Price (USD)', fontsize=11)
        ax1.set_ylabel('Rating', fontsize=11)
        ax1.set_title(f'Price vs Rating Scatter Plot\n(Correlation: {correlation:.3f})',
                      fontsize=13, fontweight='bold')
        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax1.grid(True, alpha=0.3)

        # 2. Rating distribution by price range
        ax2 = axes[1]
        self.df['Price Range'] = self._bin_prices(self.df['Price (USD)'])

        sns.boxplot(data=self.df, x='Price Range', y='Rating', ax=ax2, palette="Set3")
        ax2.set_xlabel('Price Range', fontsize=11)
        ax2.set_ylabel('Rating', fontsize=11)
        ax2.set_title('Rating Distribution by Price Range', fontsize=13, fontweight='bold')
        ax2.grid(True, alpha=0.3, axis='y')

        fig.tight_layout()
        self._save_figure(fig, 'rating_price_correlation.png', 'rating vs price correlation')

        # Print insights
        print("\n💡 Key Insights:")
        threshold = self.CORRELATION_THRESHOLD
        if pd.isna(correlation):
            print("   • Correlation could not be computed (too few or constant values)")
        elif correlation > threshold:
            print(f"   • Positive correlation ({correlation:.3f}): Higher prices tend to have better ratings")
        elif correlation < -threshold:
            print(f"   • Negative correlation ({correlation:.3f}): Higher prices tend to have lower ratings")
        else:
            print(f"   • Weak correlation ({correlation:.3f}): Price and rating are largely independent")

        # Rating by price range; observed=False keeps empty ranges in the listing
        rating_by_range = self.df.groupby('Price Range', observed=False)['Rating'].mean()
        print("\n   Average ratings by price range:")
        for price_range, avg_rating in rating_by_range.items():
            if pd.isna(avg_rating):
                print(f"     • {price_range}: no rated products")
            else:
                print(f"     • {price_range}: {avg_rating:.2f}/5")

    def generate_comprehensive_report(self):
        """Render a one-page dashboard summarising the whole dataset.

        Saves ``comprehensive_report_dashboard.png`` containing headline
        metrics, the top-5 brand market share, price and rating
        distributions, the top brands by price and rating, and a price/rating
        scatter plot.

        Raises:
            RuntimeError: If no data has been loaded.
            ValueError: If the loaded data has no rows.
            KeyError: If any of the four analysed columns is missing.
        """
        self._require_data('Brand', 'Price (USD)', 'Rating', 'Review Count')

        print("\n" + "=" * 60)
        print("Generating Comprehensive Report Dashboard")
        print("=" * 60)

        prices = self.df['Price (USD)']

        fig = plt.figure(figsize=(16, 10))
        gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

        fig.suptitle('Laptop E-Commerce Analysis: Comprehensive Report',
                     fontsize=18, fontweight='bold')

        # 1. Key metrics (top row, full width)
        ax_metrics = fig.add_subplot(gs[0, :])
        ax_metrics.axis('off')

        metrics_text = f"""
        📊 DATASET SUMMARY
        
        Total Products Analyzed: {len(self.df)}  |  Unique Brands: {self.df['Brand'].nunique()}  |  Total Reviews: {self.df['Review Count'].sum()}
        
        Average Price: ${prices.mean():.2f}  |  Price Range: ${prices.min():.2f} - ${prices.max():.2f}  |  Average Rating: {self.df['Rating'].mean():.2f}/5
        """

        ax_metrics.text(0.5, 0.5, metrics_text, transform=ax_metrics.transAxes,
                        fontsize=12, verticalalignment='center', horizontalalignment='center',
                        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        # 2. Brand market share: top 5 plus "Other", so the percentages are
        #    shares of the whole catalogue and not just of the top five.
        ax1 = fig.add_subplot(gs[1, 0])
        shares = self._top_brand_shares(self.df['Brand'].value_counts(), top_n=5)
        pie_colors = list(sns.color_palette("husl", 5)) + ['lightgray']
        ax1.pie(shares.values, labels=shares.index, autopct='%1.1f%%',
                startangle=90, colors=pie_colors)
        ax1.set_title('Top 5 Brands\nMarket Share', fontsize=11, fontweight='bold')

        # 3. Price distribution (missing prices cannot be binned)
        ax2 = fig.add_subplot(gs[1, 1])
        ax2.hist(prices.dropna(), bins=20, color='#4c6ef5', edgecolor='black', alpha=0.7)
        ax2.axvline(prices.mean(), color='red', linestyle='--', linewidth=2)
        ax2.set_xlabel('Price (USD)', fontsize=10)
        ax2.set_ylabel('Frequency', fontsize=10)
        ax2.set_title('Price Distribution', fontsize=11, fontweight='bold')
        ax2.grid(True, alpha=0.3)

        # 4. Rating distribution
        ax3 = fig.add_subplot(gs[1, 2])
        rating_counts = self.df['Rating'].value_counts().sort_index()
        ax3.bar(rating_counts.index, rating_counts.values, color='#51cf66', edgecolor='black')
        ax3.set_xlabel('Rating', fontsize=10)
        ax3.set_ylabel('Count', fontsize=10)
        ax3.set_title('Rating Distribution', fontsize=11, fontweight='bold')
        ax3.grid(True, alpha=0.3, axis='y')

        # 5. Top brands by price
        ax4 = fig.add_subplot(gs[2, 0])
        top_brands_price = self.df.groupby('Brand')['Price (USD)'].mean().nlargest(5)
        ax4.barh(top_brands_price.index, top_brands_price.values, color='#ffd43b', edgecolor='black')
        ax4.set_xlabel('Avg Price (USD)', fontsize=10)
        ax4.set_title('Top 5 Brands\nby Average Price', fontsize=11, fontweight='bold')
        ax4.grid(True, alpha=0.3, axis='x')

        # 6. Top brands by rating
        ax5 = fig.add_subplot(gs[2, 1])
        top_brands_rating = self.df.groupby('Brand')['Rating'].mean().nlargest(5)
        ax5.barh(top_brands_rating.index, top_brands_rating.values, color='#ff8787', edgecolor='black')
        ax5.set_xlabel('Avg Rating', fontsize=10)
        ax5.set_title('Top 5 Brands\nby Average Rating', fontsize=11, fontweight='bold')
        ax5.set_xlim(0, 5)
        ax5.grid(True, alpha=0.3, axis='x')

        # 7. Price vs Rating scatter with trend line when one can be fitted
        ax6 = fig.add_subplot(gs[2, 2])
        ax6.scatter(prices, self.df['Rating'], alpha=0.5, color='#845ef7', s=50)
        trend = self._fit_trend(prices, self.df['Rating'])
        if trend is not None:
            ax6.plot(trend[0], trend[1], "r--", linewidth=2)
        ax6.set_xlabel('Price (USD)', fontsize=10)
        ax6.set_ylabel('Rating', fontsize=10)
        ax6.set_title('Price vs Rating\nCorrelation', fontsize=11, fontweight='bold')
        ax6.grid(True, alpha=0.3)

        self._save_figure(fig, 'comprehensive_report_dashboard.png', 'comprehensive report dashboard')

    def print_final_insights(self):
        """Print the headline findings: leader, pricing, satisfaction, correlation.

        Raises:
            RuntimeError: If no data has been loaded.
            ValueError: If the loaded data has no rows.
            KeyError: If any of the four analysed columns is missing.
        """
        self._require_data('Brand', 'Price (USD)', 'Rating', 'Review Count')

        print("\n" + "=" * 60)
        print("FINAL INSIGHTS AND CONCLUSIONS")
        print("=" * 60)

        # Calculate key metrics
        by_brand = self.df.groupby('Brand')
        brand_counts = self.df['Brand'].value_counts()
        avg_prices = by_brand['Price (USD)'].mean()
        avg_ratings = by_brand['Rating'].mean()
        total_reviews = by_brand['Review Count'].sum()

        print("\n🎯 KEY FINDINGS:\n")

        print("1. MARKET LEADER:")
        print(f"   • {brand_counts.idxmax()} dominates the market with {brand_counts.max()} products")
        print(f"   • This represents {brand_counts.max()/len(self.df)*100:.1f}% market share\n")

        print("2. PRICING INSIGHTS:")
        print(f"   • Premium brand: {avg_prices.idxmax()} (avg ${avg_prices.max():.2f})")
        print(f"   • Budget-friendly brand: {avg_prices.idxmin()} (avg ${avg_prices.min():.2f})")
        print(f"   • Overall market average: ${self.df['Price (USD)'].mean():.2f}\n")

        print("3. CUSTOMER SATISFACTION:")
        print(f"   • Highest rated brand: {avg_ratings.idxmax()} ({avg_ratings.max():.2f}/5)")
        print(f"   • Most reviewed brand: {total_reviews.idxmax()} ({total_reviews.max()} reviews)")
        print(f"   • Overall average rating: {self.df['Rating'].mean():.2f}/5\n")

        print("4. PRICE-QUALITY RELATIONSHIP:")
        correlation = self.df['Price (USD)'].corr(self.df['Rating'])
        if pd.isna(correlation):
            # NaN compares False against any threshold, so handle it explicitly
            # instead of letting it fall through to the "notable" branch.
            print("   • Correlation could not be computed (too few or constant values)")
        elif abs(correlation) <= self.CORRELATION_THRESHOLD:
            print(f"   • Weak correlation ({correlation:.3f}): Price doesn't strongly predict quality")
            print("   • Budget laptops can offer good value for money")
        else:
            print(f"   • Notable correlation ({correlation:.3f}): Price relates to customer satisfaction")

        print("\n" + "=" * 60)

---
## 3. Run Analysis
### 3.1 Load Data

In [None]:
analysis = LaptopAnalysis("output/laptops_clean_data.csv")
# load_data() signals failure by returning False (after printing the error);
# stop here rather than letting the later cells fail on a missing DataFrame.
if not analysis.load_data():
    raise RuntimeError("Could not load output/laptops_clean_data.csv; see the error printed above.")

### 3.2 Price Distribution

In [None]:
# Draw the 2x2 price figure, save it as output/price_distribution_analysis.png, and print the price insights.
analysis.analyze_price_distribution()

### 3.3 Brand Popularity

In [None]:
# Draw the 2x2 brand figure, save it as output/brand_popularity_analysis.png, and print the brand insights.
analysis.analyze_brand_popularity()

### 3.4 Rating vs Price

In [None]:
# Plot rating against price, save output/rating_price_correlation.png, and print the correlation summary.
# Note: this call also adds a 'Price Range' column to analysis.df.
analysis.analyze_rating_vs_price()

### 3.5 Dashboard

In [None]:
# Build the summary dashboard and save it as output/comprehensive_report_dashboard.png.
analysis.generate_comprehensive_report()

### 3.6 Insights

In [None]:
# Print the text-only summary of key findings (no figure is produced).
analysis.print_final_insights()

---
## ✓ Analysis Complete!