# Implementation Linear Regresion for predict laptop prices
В данном анализе у нас будет выборка данных, состоящая из ноутбуков их характеристик и розничной цены.

Описание переменныж: 

* 'Company' - Строчное  - Производитель товара
* 'Product’ -String - Бренд или модель 
* 'TypeName' -String -Тип (Ноутбук, Ультрабук, Игровой, и т.д)
* 'Inches' -Numeric- Размер экрана в дюймах
* 'ScreenResolution' -String- Разрешение экрана 
* 'Cpu' - String - Центральный процессов (CPU)
* 'Ram' -String- Количество оперативной памяти
* 'Memory' -String- Hard Disk / SSD Memory 
* 'GPU' -String- -графический процессов (GPU)
* 'OpSys' -String- Операционная система
* 'Weight' -String- Вес 
* 'Price_euros' - Ценна в Евро


In [241]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
from itertools import *

## Первичный анализ

In [242]:
df = pd.read_csv("Price_euros_train.csv")
df.head()

In [243]:
df.shape

In [244]:
df.isna().sum(axis=1).sum()

In [245]:
df.info()

## Работа с пропусками


In [246]:
for i in df.columns:
    if(df[i].isna().sum() != 0):
        print(f'{i} count empty cells: {df[i].isna().sum()}')
df.shape

### Пропуски в ScreenResolution
В данной выборке в большей части встречается расширение 1920x1980. Поэтому было принятно, что пропуски заполнить самым часто встречающимся

In [247]:
df['ScreenResolution'].value_counts()

In [248]:
df['ScreenResolution'] = ['1920x1080' if type(i) == float else i for i in df['ScreenResolution'] ]
df['ScreenResolution'].isnull().sum()

### Работа с пропусками OpSys
Тут было принято аналагичное решение, заполнить самым часто встречающимся 

In [249]:
df['OpSys'].value_counts()

In [250]:
df['OpSys'] = ['Windows 10' if type(i) == float else i for i in df['OpSys']]
df['OpSys'].value_counts()

### Работа с пропусками в Weight
Проанализировав распределение данных, было принято решение заполнить пропуски средним значением по выборке

In [251]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)
df['Weight'].hist()

In [252]:
df['Weight'] = [df['Weight'].mean() if np.isnan(i) else i for i in df['Weight']]

In [253]:
df.isna().sum()

От пропусков мы успешно избавились

## Подготовка выборки для дальнешийх исследований
*Убираем GB из переменной Ram

In [254]:
df['Ram'].value_counts() ## Все в гб

In [255]:
df['Ram'] = df['Ram'].str.replace('GB', '')
df['Ram'] = df['Ram'].astype(float)
df['Ram'].describe()

In [256]:
df['WidthAndHeight'] = [i.split()[-1] if type(i) != float else np.nan for i in df['ScreenResolution']]

df['Width'] = [i.split('x')[0] if type(i) != float else np.nan for i in df['WidthAndHeight']]
df['Height'] = [i.split('x')[-1] if type(i) != float else np.nan for i in df['WidthAndHeight']]

df['Touchscreen'] = [1 if (type(i) != float) and ('Touchscreen' in i) else 0 for i in df['ScreenResolution']]

df['Width'] = df['Width'].astype(float)
df['Height'] = df['Height'].astype(float)

df.head()

### Удаление ненужных столбцов

In [257]:
df.drop(['ScreenResolution'], axis=1, inplace=True)
df.drop(['WidthAndHeight'], axis=1, inplace=True)
df.drop(['laptop_ID'], axis=1, inplace=True)
df.drop(['Gpu'], axis=1, inplace=True)
df.drop(['Product'], axis=1, inplace=True)
df.head()

### Cpu 
Разобьем на бренд и частоту

In [258]:
df['frequency'] =  [x.split(' ')[-1].replace('GHz', '') for x in df['Cpu']]
df['frequency'] = df['frequency'].astype(float)

df['brand_cpu'] =  [x.split(' ')[0] for x in df['Cpu']]

df.drop(['Cpu'], axis=1, inplace=True)
df.head()

### Сопоставим ОС с числом
* Windows - 0
* MacOs - 1
* Linux - 2
* Остальные - 3

In [259]:
df['OpSys'].value_counts()

In [260]:
df['OS'] = df['OpSys']

df.loc[df['OS'].str.contains('Windows'), 'OS'] = '0'
df.loc[df['OS'].str.contains('macOS|Mac OS X'), 'OS'] = '1'
df.loc[df['OS'].str.contains('Linux'), 'OS'] = '2'
df.loc[~df['OS'].isin(['0', '1', '2']), 'OS'] = 3

df.drop(['OpSys'], axis=1, inplace=True)

df['OS'].value_counts()

In [261]:
df.head()

### Преобразование Memory
Так как у нас в данной переменной лежат данные и в Гб И Тб, нужно это учесть. 
Добавим новые столбцы HDD, SSD, Flash Storage, Hybrid. Инициализируем их числовыми значениями, если они есть. В противном случае нулем. 

In [262]:
df['Memory'].value_counts()

In [263]:
def separation_components(str):
    my_list = str.split(' ')
    my_list = list(filter(None, my_list))

    str = ' '.join(my_list)

    ans = {'HDD': 0, 'SSD': 0, 'Flash Storage': 0, 'Hybrid': 0}
    components = str.split(' + ')

    for i in components:
        disk_type = i.split(' ')[1:]
        disk_type = ' '.join(disk_type)
        
        memory_size = i.split(' ')[0].replace('GB', '')

        if('TB' in memory_size):
            memory_size = memory_size.replace('TB', '')
            memory_size = float(memory_size) * 1000
            
        ans[disk_type] = memory_size

    return ans

In [264]:
HDD = []
SSD = []
Flash_Storage = []
Hybrid = []

for i in df['Memory']:
    memory_types_size = separation_components(i)
    HDD.append(memory_types_size['HDD'])
    SSD.append(memory_types_size['SSD'])
    Flash_Storage.append(memory_types_size['Flash Storage'])
    Hybrid.append(memory_types_size['Hybrid'])

df['HDD'] = HDD
df['SSD'] = SSD
df['Flash Storage'] = Flash_Storage
df['Hybrid'] = Hybrid

In [265]:
df.head()

In [266]:
df.drop(['Memory'], axis=1, inplace=True)

In [267]:
df.head()

### 

In [268]:
df['OS'] = df['OS'].astype(float)
df['Flash Storage'] = df['Flash Storage'].astype(float)
df['Hybrid'] = df['Hybrid'].astype(float)
df['SSD'] = df['SSD'].astype(float)
df['HDD'] = df['HDD'].astype(float)
df.info()

### Brand_cpu
Преобрзуем в числовые значения 
* Intel - 0
* Amd - 1
* Samsung - 2
* В других случаях - 3

In [269]:
df['brand_cpu'].value_counts()

In [270]:
replace_dict = {'Intel': 0, 'Amd': 1, 'Samsung': 2}
df['brand_cpu'] = df['brand_cpu'].map(replace_dict).fillna(3)

In [271]:
df['brand_cpu'].value_counts()

In [272]:
df.head()

### TypeName
Наглядной зависимости между TypeNmae и Price_euros не наблюдается. Поэтому пренебрежем этой переменной и удалим ее из нашего датасета.

In [273]:
df['TypeName'].value_counts()

In [274]:
plt.plot(df['TypeName'], df['Price_euros'], 'o')
plt.xlabel('TypeName')
plt.ylabel('Price_euros')
plt.show()

In [275]:
df.drop(['TypeName'], inplace=True, axis=1)

### Company

In [276]:
df.head()

In [277]:
company_names = df['Company'].unique()

company_dic = {}

for i in range(len(company_names)):
    company_dic[company_names[i]] = i
    
company_dic

In [278]:
df['new_company'] = df['Company'].map(company_dic).fillna(18)

In [279]:
df

In [280]:
df.drop(['Company'], axis=1, inplace=True)

In [281]:
df.head()

In [282]:
plt.figure(figsize=(16, 14))

dataplot = sns.heatmap(df.corr(), cmap="RdPu", annot=True,annot_kws={'fontsize': 15})

plt.title("Карта корреляции данных", fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.show()

## Linear Regression  

In [283]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR

In [284]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [285]:
X = df.drop(['Price_euros'], axis=1)
y = df['Price_euros']

In [286]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

print(f'Train dataset size: {X_train.shape}, {y_train.shape}')
print(f'Test dataset size: {X_test.shape}, {y_test.shape}')

In [287]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

In [288]:
y_pred = model.predict(X_test)

In [289]:
from sklearn import metrics 

print("MAE", metrics.mean_absolute_error(y_test, y_pred))
print("MSE", metrics.mean_squared_error(y_test, y_pred))
print("R2 Score:", model.score(X_test, y_test))

## Кроссвалидация

In [290]:
from sklearn.model_selection import KFold, cross_val_score

# Define the number of folds for cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [291]:
lin_reg = LinearRegression()

scores = cross_val_score(lin_reg, X, y, cv=kf, scoring='r2')
scores

In [292]:
mean_score = np.mean(scores)
mean_score

## Test data

In [293]:
df = pd.read_csv('Price_euros_test.csv')
df

### Подготовим тестувую выборку

In [294]:
df['ScreenResolution'] = ['1920x1080' if type(i) == float else i for i in df['ScreenResolution'] ]
df['ScreenResolution'].isnull().sum()

In [295]:
df['OpSys'] = ['Windows 10' if type(i) == float else i for i in df['OpSys']]
df['OpSys'].value_counts()

In [296]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)
df['Weight'] = [df['Weight'].mean() if np.isnan(i) else i for i in df['Weight']]

In [297]:
df['Ram'] = df['Ram'].str.replace('GB', '')
df['Ram'] = df['Ram'].astype(float)

In [298]:
df['WidthAndHeight'] = [i.split()[-1] if type(i) != float else np.nan for i in df['ScreenResolution']]

df['Width'] = [i.split('x')[0] if type(i) != float else np.nan for i in df['WidthAndHeight']]
df['Height'] = [i.split('x')[-1] if type(i) != float else np.nan for i in df['WidthAndHeight']]

df['Touchscreen'] = [1 if (type(i) != float) and ('Touchscreen' in i) else 0 for i in df['ScreenResolution']]

df['Width'] = df['Width'].astype(float)
df['Height'] = df['Height'].astype(float)

df.head()

In [299]:
df.drop(['ScreenResolution'], axis=1, inplace=True)
df.drop(['WidthAndHeight'], axis=1, inplace=True)
df.drop(['laptop_ID'], axis=1, inplace=True)
df.drop(['Gpu'], axis=1, inplace=True)
df.drop(['Product'], axis=1, inplace=True)
df.head()

In [300]:
df['frequency'] =  [x.split(' ')[-1].replace('GHz', '') for x in df['Cpu']]
df['frequency'] = df['frequency'].astype(float)

df['brand_cpu'] =  [x.split(' ')[0] for x in df['Cpu']]

df.drop(['Cpu'], axis=1, inplace=True)
df.head()

In [301]:
df['OS'] = df['OpSys']

df.loc[df['OS'].str.contains('Windows'), 'OS'] = '0'
df.loc[df['OS'].str.contains('macOS|Mac OS X'), 'OS'] = '1'
df.loc[df['OS'].str.contains('Linux'), 'OS'] = '2'
df.loc[~df['OS'].isin(['0', '1', '2']), 'OS'] = 3

df.drop(['OpSys'], axis=1, inplace=True)

df['OS'].value_counts()

In [302]:
HDD = []
SSD = []
Flash_Storage = []
Hybrid = []

for i in df['Memory']:
    memory_types_size = separation_components(i)
    HDD.append(memory_types_size['HDD'])
    SSD.append(memory_types_size['SSD'])
    Flash_Storage.append(memory_types_size['Flash Storage'])
    Hybrid.append(memory_types_size['Hybrid'])

df['HDD'] = HDD
df['SSD'] = SSD
df['Flash Storage'] = Flash_Storage
df['Hybrid'] = Hybrid
df.drop(['Memory'], axis=1, inplace=True)

In [303]:
df.head()

In [304]:
df['OS'] = df['OS'].astype(float)
df['Flash Storage'] = df['Flash Storage'].astype(float)
df['Hybrid'] = df['Hybrid'].astype(float)
df['SSD'] = df['SSD'].astype(float)
df['HDD'] = df['HDD'].astype(float)
df.info()

In [305]:
replace_dict = {'Intel': 0, 'Amd': 1, 'Samsung': 2}
df['brand_cpu'] = df['brand_cpu'].map(replace_dict).fillna(3)

In [306]:
df.drop(['TypeName'], inplace=True, axis=1)

In [307]:
company_names = df['Company'].unique()

company_dic = {}

for i in range(len(company_names)):
    company_dic[company_names[i]] = i
    
company_dic

In [308]:
df['new_company'] = df['Company'].map(company_dic).fillna(18)

In [309]:
df.drop(['Company'], axis=1, inplace=True)

In [317]:
df.head()

Unnamed: 0,Inches,Ram,Weight,Width,Height,Touchscreen,frequency,brand_cpu,OS,HDD,SSD,Flash Storage,Hybrid,new_company
0,15.6,4.0,2.1,1366.0,768.0,0,3.0,3.0,0.0,1000.0,0.0,0.0,0.0,0
1,14.0,16.0,1.95,1920.0,1080.0,0,2.8,0.0,0.0,0.0,1000.0,0.0,0.0,1
2,15.6,8.0,1.7,1920.0,1080.0,0,1.8,0.0,0.0,0.0,256.0,0.0,0.0,2
3,17.3,16.0,2.99,1920.0,1080.0,0,2.8,0.0,0.0,1000.0,256.0,0.0,0.0,2
4,15.6,12.0,2.25,1366.0,768.0,0,2.5,0.0,0.0,1000.0,0.0,0.0,0.0,3


## Predict

In [311]:
X_test = df
y_pred = model.predict(X_test)
y_pred

In [312]:
result = pd.DataFrame()
result['Price_euros'] = y_pred

In [313]:
result.to_csv('result.csv')

In [318]:
result

Unnamed: 0,Price_euros
0,339.569281
1,2459.263758
2,968.152623
3,1696.515924
4,1036.547167
...,...
190,758.287561
191,737.516935
192,1355.438765
193,1961.438819
