In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data

In [None]:
# Column schema of the scraped export. `names=` is passed without `header=`,
# so pandas reads the very first line of the file as data, not as a header.
COLUMN_NAMES = [
    "URL", "Name", "Price", "Processor", "Storage", "RAM",
    "OperatingSystem", "ScreenSize", "Resolution", "Height", "Camera", "Colour",
]

try:
    df = pd.read_csv('data.tsv', sep='\t', names=COLUMN_NAMES)
    print("Data loaded successfully!")
except FileNotFoundError:
    # Best-effort fallback: stay runnable, but keep the expected columns so
    # later cells fail (or render empty) on the data, not on a KeyError that
    # hides the real cause - the missing file.
    df = pd.DataFrame(columns=COLUMN_NAMES)
    print("Error: File 'data.tsv' not found.")

df.head()

## 1 Exploratory Analysis

### 1.1 General info

In [None]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

### 1.2 Statistics

In [None]:
# include='all' summarises every column, not only the numeric ones.
df.describe(include='all')

In [None]:
# Row count per distinct OperatingSystem value (missing values are excluded by default).
df['OperatingSystem'].value_counts()

In [None]:
def _first_number(column, drop_commas=False):
    """Parse the first decimal number in each cell of ``column`` into a float.

    Parameters
    ----------
    column : pd.Series
        Text such as '£1,299.00', '8GB' or '6.1 Inches'. Already-numeric
        input is accepted as well because every cell is cast to text first,
        which keeps this cell re-runnable after the columns were converted.
    drop_commas : bool, default False
        Remove ',' before matching, for values written with thousands
        separators.

    Returns
    -------
    pd.Series
        Numeric values; cells without any digits (e.g. 'N/A' or missing
        values) become NaN.
    """
    text = column.astype(str)
    if drop_commas:
        # '1,299' would otherwise be cut short at the comma and parse as 1.
        text = text.str.replace(',', '', regex=False)
    digits = text.str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(digits, errors='coerce')


# Work out the unit factor from the text BEFORE the column is overwritten:
# terabyte values are expressed in GB (1 TB = 1024 GB), everything else as-is.
_storage_text = df['Storage'].astype(str)
_gb_per_unit = np.where(
    _storage_text.str.contains('TB', case=False, na=False, regex=False), 1024, 1
)

# Raw column -> (name used by the plotting / outlier cells, parsed values).
_parsed = {
    'Price': ('Price_Num', _first_number(df['Price'], drop_commas=True)),
    'RAM': ('RAM_GB', _first_number(df['RAM'])),
    'Storage': ('Storage_GB', _first_number(df['Storage']) * _gb_per_unit),
    'ScreenSize': ('ScreenSize_Num', _first_number(df['ScreenSize'])),
}
for _raw_name, (_alias, _values) in _parsed.items():
    # Keep converting the original column in place, and additionally publish
    # the values under the name the later cells read; without these columns
    # the first plot fails with a KeyError.
    df[_raw_name] = _values
    df[_alias] = _values

df['Camera'] = _first_number(df['Camera'])
df['Height'] = _first_number(df['Height'])

# Resolution stays textual: drop the unit word and the whitespace it leaves.
df['Resolution'] = (
    df['Resolution']
    .str.replace('pixels', '', regex=False)
    .str.strip()
    .replace('N/A', np.nan)
)

## Data Distribution

In [None]:
# Price histogram split by operating system: one stepped outline and one
# KDE curve per OS, so the per-OS price distributions can be compared.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='Price_Num',
    hue='OperatingSystem',
    kde=True,
    element="step",
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Price Affected by Operating System')
ax.set_xlabel('Price (£)')
plt.show()

In [None]:
# Three variables in one view: RAM on x, price on y, and screen size encoded
# as marker area; colour separates the operating systems.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(
    data=df,
    x='RAM_GB',
    y='Price_Num',
    size='ScreenSize_Num',
    hue='OperatingSystem',
    alpha=0.6,
    sizes=(20, 500),
    ax=ax,
)
ax.set_title('Price vs. RAM (Dot Size = Screen Size)')
ax.set_ylabel('Price (£)')
# Anchor the legend just outside the right edge so it does not cover points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Brand = first whitespace-separated token of the product name.
# The vectorised .str accessor yields NaN for a missing or blank name, where
# a Python-level x.split()[0] would raise AttributeError / IndexError.
df['Brand'] = df['Name'].str.split().str[0]

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Brand', y='Price_Num', palette='Set3')
plt.title('Price Range Distribution per Brand')
plt.ylabel('Price (£)')
plt.show()

In [None]:
# Horizontal violins: one violin per storage value on the y axis, showing how
# screen sizes are distributed within it; category labels read more easily
# down the side than along the bottom.
fig, ax = plt.subplots(figsize=(10, 8))
sns.violinplot(
    data=df,
    x='ScreenSize_Num',
    y='Storage_GB',
    orient='h',
    palette='muted',
    ax=ax,
)
ax.set_title('Distribution of Screen Sizes across Storage Tiers')
ax.set_xlabel('Screen Size (Inches)')
ax.set_ylabel('Storage (GB)')
plt.show()

In [None]:
# Joint view of screen size against price: a regression scatter in the centre
# with each variable's own distribution drawn along the margins.
g = sns.jointplot(
    data=df,
    x='ScreenSize_Num',
    y='Price_Num',
    kind="reg",
    color="m",
    height=7,
)
# y > 1 lifts the title above the top marginal plot.
g.fig.suptitle('Joint Distribution: Screen Size vs. Price', y=1.03)
plt.show()

### Outlier Detection

In [None]:
# Tukey fences on price: anything further than 1.5 * IQR below the first
# quartile or above the third quartile is reported as an outlier.
price_col = df['Price_Num']
Q1, Q3 = price_col.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows with a missing price compare False on both sides and are not flagged.
is_outlier = price_col.lt(lower_bound) | price_col.gt(upper_bound)
outliers = df.loc[is_outlier]

print(f"Number of price outliers detected: {len(outliers)}")
outliers.loc[:, ['Name', 'Price_Num']]

### Missing Value Analysis

In [None]:
# Per-column count of missing cells; the Series is the cell's displayed output.
print("Missing values per column:")
df.isna().sum()