# In [None]:
# Laptop Price Analysis — Ready-to-run Jupyter-style Python notebook
# Save this file as `laptop_price_analysis.py` or open it in Jupyter/VS Code (it uses cell markers # %% ).
# Instructions:
# 1. Put the laptop CSV file in the same folder and name it `laptop_prices.csv` (or change the path below).
# 2. Run the cells in order. If any library is missing, install it via pip (see cell below).

# %%
# 0) Install required libraries (run once if needed)
# !pip install pandas numpy matplotlib seaborn scikit-learn joblib
# If you'd like to try XGBoost uncomment the next line (optional):
# !pip install xgboost

# %%
# 1) Imports
# The warnings filter is installed ahead of the remaining imports, so warnings raised
# while importing them are suppressed as well.
# NOTE(review): 'ignore' silences every warning for the whole session, including
# deprecation notices from the libraries used below — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Standard library + data handling / plotting stack
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn building blocks (splitting, preprocessing, models, metrics) and
# joblib for persisting the fitted model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Use seaborn's 'whitegrid' style for the plots drawn later in the notebook.
sns.set(style='whitegrid')

# %%
# 2) Load the dataset
# The CSV is expected in the working directory under the name below; edit DATA_PATH
# if your copy lives elsewhere.
DATA_PATH = 'laptop_prices.csv'

if os.path.exists(DATA_PATH):
    # The file is decoded as ISO-8859-1 (Latin-1).
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
else:
    # Stop right away with an actionable message when the file is missing.
    raise FileNotFoundError(f"{DATA_PATH} not found. Please place the CSV in the working directory and try again.")

print('Dataset loaded — shape:', df.shape)

# Quick peek: the first three rows, transposed so every column gets its own line.
print('\n--- head ---', df.head(3).T, sep='\n')

# %%
# 3) Basic info & missing values
print('\n--- info ---')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\n--- missing values ---')
# Number of null entries per column.
print(df.isnull().sum())

# %%
# 4) Quick descriptive statistics
print('\n--- describe ---')
# include='all' summarises non-numeric columns as well; the result is transposed
# so that each dataset column becomes one output row.
print(df.describe(include='all').T)

# %%
# 5) Feature engineering
# Pixel-Per-Inch (PPI): diagonal resolution in pixels divided by the screen size in
# inches, rounded to two decimals. When any of the three source columns is missing
# the PPI column is still created, filled with NaN.
if not {'ScreenW', 'ScreenH', 'Inches'} <= set(df.columns):
    df['PPI'] = np.nan
else:
    diagonal_px = (df['ScreenW']**2 + df['ScreenH']**2)**0.5
    df['PPI'] = (diagonal_px / df['Inches']).round(2)

# Encode the Yes/No display flags as 1/0. Any value other than 'Yes'/'No' maps to NaN
# and is then restored from the original column, so such entries are left untouched.
yes_no_to_int = {'Yes': 1, 'No': 0}
for col in ('Touchscreen', 'IPSpanel', 'RetinaDisplay'):
    if col not in df.columns:
        continue
    flag_codes = df[col].map(yes_no_to_int)
    df[col] = flag_codes.fillna(df[col])

# Show the column list including the engineered features.
print('\n--- columns after feature engineering ---', df.columns.tolist(), sep='\n')

# %%
# 6) Exploratory Data Analysis (plots)
# A selection of useful EDA visuals; run interactively to inspect.

# Price distribution: histogram with a KDE curve overlaid
plt.figure(figsize=(10,5))
sns.histplot(df['Price_euros'], bins=40, kde=True)
plt.title('Distribution of Laptop Prices (Euros)')
plt.xlabel('Price_euros')
plt.show()

# Company counts, ordered by how many laptops each company has
plt.figure(figsize=(12,4))
sns.countplot(y='Company', data=df, order=df['Company'].value_counts().index)
plt.title('Number of laptops per Company')
plt.show()

# Price by Company (boxplot); tick labels are rotated so company names do not overlap
plt.figure(figsize=(14,5))
sns.boxplot(x='Company', y='Price_euros', data=df)
plt.xticks(rotation=45)
plt.title('Price distribution by Company')
plt.show()

# The charts below depend on optional columns. The complete chart (figure, plot,
# title, show) sits inside each guard, so a dataset lacking the column skips the
# chart instead of raising inside seaborn.

# Touchscreen vs price
if 'Touchscreen' in df.columns:
    plt.figure(figsize=(6,4))
    sns.boxplot(x='Touchscreen', y='Price_euros', data=df)
    plt.title('Touchscreen vs Price')
    plt.show()

# RAM vs Price (barplot with mean)
if 'Ram' in df.columns:
    plt.figure(figsize=(8,4))
    sns.barplot(x='Ram', y='Price_euros', data=df, estimator=np.mean)
    plt.title('Average Price by RAM')
    plt.show()

# PPI vs Price scatter
if 'PPI' in df.columns:
    plt.figure(figsize=(7,5))
    sns.scatterplot(x='PPI', y='Price_euros', data=df)
    plt.title('PPI vs Price')
    plt.show()

# %%
# 7) Preprocessing for modeling
# 'Product' is a high-cardinality product name; it is recorded here when present.
DROP_COLS = [name for name in ('Product',) if name in df.columns]

# Feature wishlist (adjust according to your dataset).
candidate_features = [
    'Company', 'TypeName', 'Inches', 'Ram', 'OS', 'Weight',
    'Touchscreen', 'IPSpanel', 'RetinaDisplay', 'PPI',
    'CPU_company', 'CPU_freq',
    'PrimaryStorage', 'PrimaryStorageType',
    'SecondaryStorage', 'SecondaryStorageType',
    'GPU_company',
]

# Keep the wishlist order, but only for columns the dataframe actually has.
available_columns = set(df.columns)
features = [name for name in candidate_features if name in available_columns]
print('\nUsing features:', features)

# Target column; abort when the dataset does not provide it.
TARGET = 'Price_euros'
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in dataset.")

# Work on copies so later edits to X / y never touch df.
X = df[features].copy()
y = df[TARGET].copy()

# Quick cleanup: an object column whose non-null values are exactly 'Yes' and 'No'
# is still a boolean in disguise — encode it as 1/0.
yes_no_values = {'Yes', 'No'}
for col in X.select_dtypes(include='object').columns:
    observed_values = set(X[col].dropna().unique())
    if observed_values == yes_no_values:
        X[col] = X[col].map({'Yes':1,'No':0})

# Show the first rows of the feature matrix.
print('\n--- X sample ---', X.head(), sep='\n')

# %%
# 8) Split dataset
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
train_shape, test_shape = X_train.shape, X_test.shape
print('\nTrain size:', train_shape, 'Test size:', test_shape)

# %%
# 9) Build preprocessing pipeline
# Separate numeric and categorical columns.
# 'number' selects every numeric dtype, not only int64/float64.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
# Every remaining feature is treated as categorical. Taking the complement guarantees
# each feature reaches exactly one transformer: ColumnTransformer drops columns that
# are not listed (remainder='drop' is its default), so a dtype whitelist would
# silently lose e.g. bool, category or string-dtype columns.
cat_cols = [c for c in X.columns if c not in numeric_cols]

print('\nNumeric columns:', numeric_cols)
print('Categorical columns:', cat_cols)

# Numeric features are standardised; categorical features are one-hot encoded.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore': categories first seen at predict time encode as all zeros
# instead of raising.
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Output column order follows this list: numeric block first, then the one-hot block.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', cat_transformer, cat_cols)
])

# %%
# 10) Modeling — baseline (Linear Regression), then RandomForest with GridSearch
# Baseline: preprocessing followed by an ordinary least-squares regressor.
lr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipe_lr = Pipeline(steps=lr_steps)
pipe_lr.fit(X_train, y_train)

# Score the baseline on the held-out test set.
y_pred_lr = pipe_lr.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)
print('\nLinear Regression — MSE:', lr_mse, 'R2:', lr_r2)

# %%
# Random Forest tuned with a small grid search
rf_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
pipe_rf = Pipeline(steps=rf_steps)

# The 'regressor__' prefix routes each parameter to the pipeline's 'regressor' step.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
)

# 3-fold cross-validation scored by R2, using all available cores.
gsearch = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
gsearch.fit(X_train, y_train)

best_rf = gsearch.best_estimator_
print('\nBest params:', gsearch.best_params_)

# Evaluate the best pipeline on the held-out test set.
y_pred_rf = best_rf.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)
print('RandomForest — MSE:', rf_mse, 'R2:', rf_r2)

# %%
# 11) Feature importance (for RandomForest) — need to extract names after one-hot encoding
# The forest is trained on the ColumnTransformer output, whose columns are the numeric
# features followed by the one-hot columns, so the names are rebuilt in that order.
fitted_preprocessor = best_rf.named_steps['preprocessor']
if cat_cols:
    # Ask the fitted encoder for the expanded "<column>_<category>" names.
    ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_names = ohe.get_feature_names_out(cat_cols).tolist()
else:
    cat_names = []

feature_names = numeric_cols + cat_names
importances = best_rf.named_steps['regressor'].feature_importances_
# Keep the 30 most important features, largest first.
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(30)

plt.figure(figsize=(10,6))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title('Top 30 Feature Importances (RandomForest)')
plt.show()

# %%
# 12) Final evaluation with selected model and simple residual plot
model = best_rf
y_pred = model.predict(X_test)
print('\nFinal model R2:', r2_score(y_test, y_pred))
# RMSE is computed as the square root of the MSE instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. np.sqrt works on every version.
print('Final model RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

# Actual vs predicted: points on the dashed red diagonal are perfect predictions.
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted — Final Model')
plt.show()

# Residuals (actual minus predicted)
residuals = y_test - y_pred
plt.figure(figsize=(7,4))
sns.histplot(residuals, bins=40, kde=True)
plt.title('Residuals Distribution')
plt.show()

# %%
# 13) Save the trained model
# The whole pipeline (preprocessing + regressor) is written to a single file.
MODEL_PATH = 'laptop_price_model.joblib'
joblib.dump(value=model, filename=MODEL_PATH)
print(f'Model saved to {MODEL_PATH}')

# %%
# 14) Example: load model & make a single prediction
# The first test row serves as a sample datapoint; it is kept as a one-row DataFrame
# so it has the same columns the pipeline was trained on. Adapt values to your dataset.
sample = X_test.head(1).copy()
print('\nSample input:', sample, sep='\n')

# Reload the pipeline from disk and predict with the reloaded copy.
loaded = joblib.load(MODEL_PATH)
sample_pred = loaded.predict(sample)
predicted_price = sample_pred[0]
actual_price = y_test.iloc[0]
print('\nPredicted price for sample:', predicted_price)
print('Actual price:', actual_price)

# %%
# 15) Next steps (suggestions)
# - Try GradientBoostingRegressor or XGBoost for better performance.
# - Use more advanced hyperparameter tuning (RandomizedSearchCV, larger grids).
# - Do more feature engineering (e.g. parse CPU_model, GPU_model and storage sizes into categories).
# - If the dataset is large (or has high-cardinality categorical columns), consider
#   categorical encoding methods such as TargetEncoder instead of one-hot encoding.
# - Save the preprocessor and the model separately if needed for deployment.

# Final message marking the end of the run.
print('\nNotebook finished. Good luck!')
