In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

## Data

In [None]:
# Start from an empty frame so `df` exists even when the file cannot be read.
df = pd.DataFrame()
try:
    # Column names are passed explicitly, so pandas treats the first line of
    # the file as data rather than as a header.
    df = pd.read_csv(
        'data.tsv',
        sep='\t',
        names=[
            "URL", "Name", "Price", "Processor", "Storage", "RAM",
            "OperatingSystem", "ScreenSize", "Resolution", "Height",
            "Camera", "Colour",
        ],
    )
except FileNotFoundError:
    print("Error: File 'data.tsv' not found.")

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

## 1 Exploratory Analysis

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

### 1.1 General info

In [None]:
def _first_number(series):
    """Return the first decimal number in each entry of `series` as float64.

    Entries are stringified first, so the helper works both on the raw text
    columns and on columns that were already converted (re-running this cell
    is therefore safe). Entries that contain no digits become NaN instead of
    raising.
    """
    extracted = series.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(extracted, errors='coerce').astype(float)


# Price: drop thousands separators first ("£1,299.00" -> 1299.0); the currency
# sign is skipped by the number extraction.
df['Price'] = _first_number(df['Price'].astype(str).str.replace(',', '', regex=False))
df['RAM'] = _first_number(df['RAM'])
# Storage is kept in GB: entries that mention TB are scaled by 1024. The right-hand
# side is evaluated completely before the column is overwritten.
df['Storage'] = _first_number(df['Storage']) * np.where(
    df['Storage'].astype(str).str.contains('TB', case=False, na=False), 1024, 1
)
df['ScreenSize'] = _first_number(df['ScreenSize'])
df['Camera'] = _first_number(df['Camera'])
df['Height'] = _first_number(df['Height'])
# Resolution stays textual here; only the 'pixels' suffix is removed.
df['Resolution'] = df['Resolution'].str.replace('pixels', '', regex=False).replace('N/A', np.nan)
def get_res(val):
    """Return the total pixel count parsed from a resolution value.

    Parameters
    ----------
    val : any
        Resolution text such as "2560 x 1600"; it is converted with ``str``.

    Returns
    -------
    int or float
        Product of the two dimensions, or ``np.nan`` when fewer than two
        numbers can be found (e.g. for missing values).
    """
    # Drop thousands separators so "2,000 x 1,200" is read as 2000 x 1200.
    text = re.sub(r'(?<!\d)(\d{1,3}),(\d{3})(?!\d)', r'\1\2', str(val))
    # Prefer an explicit "<width> x <height>" pair so unrelated leading numbers
    # (e.g. "11-inch 2388 x 1668") are not mistaken for a dimension.
    pair = re.search(r'(\d+)\s*[xX×*]\s*(\d+)', text)
    if pair:
        return int(pair.group(1)) * int(pair.group(2))
    # Fallback: the first two numbers found anywhere in the text.
    nums = re.findall(r'\d+', text)
    return int(nums[0]) * int(nums[1]) if len(nums) >= 2 else np.nan

# Derive one numeric value per row from the textual resolution via get_res.
df['Resolution_Total'] = df['Resolution'].map(get_res)
df.head()

In [None]:
# Number of rows per operating system (missing values are not counted by default).
df['OperatingSystem'].value_counts()

In [None]:
# Re-check column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, rounded to two decimals.
df.describe().round(2)

In [None]:
def _first_mode(col):
    """Return the first (smallest) mode of `col`, or NaN when it has no non-null values."""
    modes = col.mode()
    # mode() is empty for an all-NaN column; .iat[0] would raise IndexError there.
    return modes.iat[0] if not modes.empty else np.nan


# Mean, median and mode of every numeric column.
result = df.select_dtypes(include=['number']).agg(['mean', 'median', _first_mode], axis=0)
result.index = ['mean', 'median', 'modus']
result

### 1.2 Data Distribution

In [None]:
# Distribution of Price (Histogram)
plt.figure(figsize=(10, 5))
sns.histplot(df['Price'], kde=True)
plt.title('Distribution of Tablet Prices')
# COMMENT: The distribution is right-skewed, showing a concentration of budget tablets with a long tail of luxury models.

In [None]:
plt.figure(figsize=(10, 6))
# x is the operating system itself, so the bars show how many rows belong to
# each OS (this is not a price distribution); 'hue' only colours each OS.
sns.histplot(data=df, x='OperatingSystem', hue='OperatingSystem', element="step")
plt.title('Number of Tablets per Operating System')
plt.xlabel('Operating System')
plt.show()

In [None]:
# Helper: Extract Brand for better visualization - logaritmické měřítko? asi?
df['Brand'] = df['Name'].apply(lambda x: x.split()[0])

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Brand', y='Price', hue='Brand')
plt.title('Price Range Distribution per Brand')
plt.ylabel('Price (£)')
plt.show()

In [None]:
# OS Price Distribution (Box Plot
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='OperatingSystem', y='Price', hue='OperatingSystem')
plt.title('Price Ranges by Operating System')
# COMMENT: iPadOS devices consistently occupy the higher price bracket compared to Android.

In [None]:
# RAM vs Price (Scatter Plot)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='RAM', y='Price', hue='OperatingSystem', s=100)
plt.title('RAM Capacity vs. Price')
# COMMENT: There is a clear positive correlation; higher RAM almost always results in a higher price point.

In [None]:
plt.figure(figsize=(10, 8))
# Swapping axes makes it easier to read categories
# 'hue' is set to the same variable as the categorical axis because passing
# 'palette' without 'hue' is deprecated in seaborn 0.13.
sns.violinplot(data=df, x='ScreenSize', y='Storage', hue='Storage', orient='h', palette='muted')
plt.title('Distribution of Screen Sizes across Storage Tiers')
plt.xlabel('Screen Size (Inches)')
plt.ylabel('Storage (GB)')
plt.show()

In [None]:
# Storage vs Price (Violin Plot)
plt.figure(figsize=(12, 6))
sns.violinplot(data=df, x='Storage', y='Price', hue='Storage', palette='pastel')
plt.title('Price Density per Storage Tier')
# COMMENT: Shows that price variance increases significantly at higher storage capacities (512GB+).

In [None]:
# Correlation Heatmap
plt.figure(figsize=(10, 8))
correlation_cols = ['Price', 'Storage', 'RAM', 'Height', 'ScreenSize', 'Camera', 'Resolution_Total']
sns.heatmap(df[correlation_cols].corr(), annot=True, cmap='coolwarm')
plt.title('Attribute Correlation Matrix')
# COMMENT: RAM and Storage have the strongest correlation with Price.
plt.show()

### 1.3 Outlier Detection

In [None]:
def outliers_iqr(df, column_name, factor=1.5):
    """Flag outliers in one column using the interquartile-range rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    column_name : str
        Name of the numeric column to inspect.
    factor : float, default 1.5
        Multiple of the IQR added above Q3 / subtracted below Q1 to obtain
        the bounds.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``df``; True where the value lies strictly
        outside ``[Q1 - factor * IQR, Q3 + factor * IQR]``. Missing values
        compare as False and are therefore never flagged.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    return (values < lower_bound) | (values > upper_bound)

# One boolean outlier mask per numeric attribute, computed in the listed order.
(
    outliers_price,
    outliers_storage,
    outliers_ram,
    outliers_screen,
    outliers_height,
) = (
    outliers_iqr(df, column)
    for column in ('Price', 'Storage', 'RAM', 'ScreenSize', 'Height')
)


In [None]:
# Price Outliers
# RAM outliers: name and RAM of every row flagged by the RAM mask.
df.loc[outliers_ram, ['Name', 'RAM']]

In [None]:
# Storage Outliers:
# Storage outliers: name and storage of every row flagged by the storage mask.
df.loc[outliers_storage, ['Name', 'Storage']]

In [None]:
# Price Outliers
# Price outliers: name and price of every row flagged by the price mask.
df.loc[outliers_price, ['Name', 'Price']]

## 2 Preparation

### 2.1 Variant 1

In [None]:
# Variant 1 works on a copy of the frame without the columns listed here;
# the remaining missing values are counted afterwards.
df1 = df.drop(
    ['URL', 'Name', 'Resolution', 'Height', 'Camera', 'Processor', 'Colour'],
    axis='columns',
)
df1.isna().sum()

In [None]:

# Impute missing values: a placeholder label for the operating system and the
# column median for Price, Storage and RAM. The fill values are computed before
# any column is modified.
df1 = df1.fillna(value={
    'OperatingSystem': 'Unknown',
    'Price': df1['Price'].median(),
    'Storage': df1['Storage'].median(),
    'RAM': df1['RAM'].median(),
})
df1.isna().sum()

In [None]:
# Discretization
df1['Price_Tier'] = pd.cut(df1['Price'], bins=[0, 300, 700, 5000], labels=['Budget', 'MidRange', 'Premium'])
df1['Size_Tier'] = pd.cut(df1['ScreenSize'].fillna(10), bins=[0, 9, 11, 15], labels=['Small', 'Normal', 'Large'])
df1_final = df1.drop(columns=['Price', 'ScreenSize']).head(50)
df1_final.to_csv('variant1_categorical.csv', index=False)
df1_final.head()

### 2.2 Variant 2

In [None]:
# Variant 2 starts from the same column selection: the columns listed here are
# dropped, then the remaining missing values are counted.
df2 = df.drop(
    ['URL', 'Name', 'Resolution', 'Height', 'Camera', 'Processor', 'Colour'],
    axis='columns',
)
df2.isna().sum()