# Лаба 2 - Линейная регрессия

## Задача
В этом наборе данных вам предстоит решить актуальную задачу для поисковиков: **нужно оценить насколько привлекателен веб-ресурс по некоторому набору факторов**.

В наборе данных представлено 8000 записей о различных анонимизированных доменах и соответствующие оценки привлекательности(числовые)

Нужно понять насколько домен привлекателен по остальным факторам.

## Описание столбцов

| столбец                | описание                                            |
|------------------------|-----------------------------------------------------|
| category               | категория к которой относится сайт                  |
| clicks                 | кол-во кликов по домену                             |
| likes                  | кол-во лайков поставленных домену                   |
| buys                   | кол-во покупок совершенных на домене                |
| 4xx_errors             | кол-во ошибок с кодом 4хх за последние 6 мес        |
| 5xx_errors             | кол-во ошибок с кодом 5хх за последние 6 мес        |
| complaints_count       | кол-во жалоб на домен                               |
| average_dwelltime      | среднее время проведенное пользователем на домене ( в минутах) |
| date_of_registration   | дата регистрации домена                             |
| source_attractiveness  | привлекательность домена (таргет)                   |

### Пояснение к задаче
Ваша задача предстоит не только в написании кода обучения модели. Глобально вы должны предоставить ноутбук с полноценным анализом данных, очисткой данных, сделать выводы на основе графического анализа.

За отсутствие надлежащего оформления будут снижаться баллы. В критических случаях - лаба не будет принята.

In [100]:
import pandas as pd
# pd.options.display.max_rows = None

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import metrics

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [101]:
train_data_path = 'train.csv'
test_data_path = 'test.csv'

In [102]:
initial_df = pd.read_csv(train_data_path)
initial_df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
initial_df.set_index('id', inplace=True)

Натренируем модель на начальном датасете

In [103]:
default_df = initial_df.copy(deep=True)
default_df.dropna(inplace=True)

def non_numeric_to_zero(row):
    try: return int(row)
    except: return 0

default_df['date_of_registration'] = pd.to_datetime(default_df['date_of_registration'])
default_df['age'] = (pd.Timestamp('2024-09-30') - default_df['date_of_registration']) / pd.Timedelta(days=365)
default_df.drop('date_of_registration', axis='columns', inplace=True)

default_df['complaints_count'] = default_df['complaints_count'].apply(non_numeric_to_zero)

print(f'После чистки данных: {default_df.shape}')


# divide to test and train
default_x_train, default_x_test, default_y_train, default_y_test = train_test_split(
    default_df.drop('source_attractiveness', axis='columns'),
    default_df['source_attractiveness'],
    test_size=0.2,
    shuffle=True,
    stratify=default_df['category']
)


# encode categories 
encoder = OneHotEncoder(drop='first', sparse_output=False)
default_x_train = np.hstack([default_x_train.drop(['category'], axis='columns'), encoder.fit_transform(default_x_train['category'].to_frame())])
default_x_test = np.hstack([default_x_test.drop(['category'], axis='columns'), encoder.transform(default_x_test['category'].to_frame())])

default_model = LinearRegression(fit_intercept=True)
default_model.fit(default_x_train, default_y_train)

После чистки данных: (6931, 10)


In [104]:
default_test_predicts = default_model.predict(default_x_test)
print(f'TEST:\n\
    MSE = {metrics.mean_squared_error(default_y_test, default_test_predicts)}\n\
    MAE = {metrics.mean_absolute_error(default_y_test, default_test_predicts)}\n\
    MAPE = {metrics.mean_absolute_percentage_error(default_y_test, default_test_predicts) * 100:.1f}%'
)

default_train_predicts = default_model.predict(default_x_train)
print(f'TRAIN:\n\
    MSE = {metrics.mean_squared_error(default_y_train, default_train_predicts)}\n\
    MAE = {metrics.mean_absolute_error(default_y_train, default_train_predicts)}\n\
    MAPE = {metrics.mean_absolute_percentage_error(default_y_train, default_train_predicts) * 100:.1f}%'
)


go.Figure(
    data=[
        go.Histogram(x=default_test_predicts, name='test'),
        go.Histogram(x=default_train_predicts, name='train'),
        go.Histogram(x=initial_df['source_attractiveness'], name='target'),
    ],
    layout=dict(title='Дефолтная модель')
).show()

TEST:
    MSE = 0.01622472714353693
    MAE = 0.08756491880427221
    MAPE = 343.6%
TRAIN:
    MSE = 0.015761281626669375
    MAE = 0.08699043102188775
    MAPE = 210.6%


Наша цель - сделать лучше

Настраиваем данные

In [105]:
df = initial_df.copy(deep=True)

print(f'До чистки данных: {df.shape}')

def setup_cleared_df(df):
    df_cleared = df.dropna()

    df_cleared.drop(df_cleared[df_cleared['clicks'] < 0].index, inplace=True)
    df_cleared.drop(df_cleared[df_cleared['likes'] < 0].index, inplace=True)
    df_cleared.drop(df_cleared[df_cleared['buys'] < 0].index, inplace=True)
    df_cleared.drop(df_cleared[df_cleared['4xx_errors'] < 0].index, inplace=True)
    df_cleared.drop(df_cleared[df_cleared['5xx_errors'] < 0].index, inplace=True)
    df_cleared.drop(df_cleared[df_cleared['average_dwelltime'] < 0].index, inplace=True)

    df_cleared['date_of_registration'] = pd.to_datetime(df_cleared['date_of_registration'])
    df_cleared['age'] = (pd.Timestamp('2024-09-30') - df_cleared['date_of_registration']) / pd.Timedelta(days=365)
    df_cleared.drop(['date_of_registration'], axis='columns', inplace=True)

    df_cleared['clicks'] = df_cleared['clicks'].astype('int32')
    df_cleared['likes'] = df_cleared['likes'].astype('int32')

    def remove_non_numeric(row):
        try: return int(row)
        except: return -1

    df_cleared['complaints_count'] = df_cleared['complaints_count'].apply(remove_non_numeric)
    df_cleared.drop(df_cleared[df_cleared['complaints_count'] == -1].index, inplace=True)

    return df_cleared

df = setup_cleared_df(df)

print(f'После чистки данных: {df.shape}')

До чистки данных: (8000, 10)
После чистки данных: (6229, 10)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

Посмотрим на некоторые распределения в датасете:

In [106]:
categories = df.groupby('category')
categories_count = categories.count().max(axis=1)
categories_likes = categories['likes'].sum()
categories_buys = categories['buys'].sum()
categories_clicks = categories['clicks'].sum()
categories_4xx = categories['4xx_errors'].sum()
categories_5xx = categories['5xx_errors'].sum()
categories_complaints = categories['complaints_count'].sum()

categories_attractiveness = categories['source_attractiveness']


categories_bars = make_subplots(
    rows=7,
    subplot_titles=[
        'Количество',
        'Кол-во кликов',
        'Кол-во лайков',
        'Кол-во покупок',
        'Кол-во ошибок',
        'Кол-во жалоб',
        'Привлекательность',
    ]
)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во доменов',
        x=categories_count.index,
        y=categories_count.values,
    ),
    row=1, col=1,
)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во кликов',
        x=categories_clicks.index,
        y=categories_clicks.values,
    ),
    row=2, col=1,
)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во лайков',
        x=categories_likes.index,
        y=categories_likes.values,
    ),
    row=3, col=1,
)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во покупок',
        x=categories_buys.index,
        y=categories_buys.values,
    ),
    row=4, col=1,
)
categories_bars.update_yaxes(type='log', range=[1, 10], row=4, col=1)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во 4xx ошибок',
        x=categories_4xx.index,
        y=categories_4xx.values,
    ),
    row=5, col=1,
)
categories_bars.add_trace(
    go.Bar(
        name='Кол-во 5xx ошибок',
        x=categories_5xx.index,
        y=categories_5xx.values,
    ),
    row=5, col=1,
)
categories_bars.update_yaxes(type='log', row=5, col=1)

categories_bars.add_trace(
    go.Bar(
        name='Кол-во жалоб',
        x=categories_complaints.index,
        y=categories_complaints.values,
    ),
    row=6, col=1,
)

categories_bars.add_trace(
    go.Bar(
        name='Минимальная привлекательность',
        x=categories_attractiveness.min().index,
        y=categories_attractiveness.min().values,
    ),
    row=7, col=1,
)
categories_bars.add_trace(
    go.Bar(
        name='Средняя привлекательность',
        x=categories_attractiveness.mean().index,
        y=categories_attractiveness.mean().values,
    ),
    row=7, col=1,
)
categories_bars.add_trace(
    go.Bar(
        name='Максимальная привлекательность',
        x=categories_attractiveness.max().index,
        y=categories_attractiveness.max().values,
    ),
    row=7, col=1,
)

categories_bars.update_layout(
    title='Разбиение доменов по категориям',
    barmode='group',
    width=1000, height=2000,
)

categories_bars.show()

Идея - разбить домены по категории и для каждой категории натренировать отдельную модель *(пока лень)*

Идея - нормализовать привлекательность домена

Идея - создать новую фичу - активность, которая считается как отношение суммы действий на домене к количеству кликов

~~Идея - забить на неважные фичи - ошибки, дата регистрации.~~ После пары попыток обучения, выяснилось, что "возраст" сайта и количество ошибок важны

Обучим новую модель на основе полученной информации

In [107]:
def setup_df_activity(df):
    df_activity = df[[
        'age',
        'likes', 'buys',
        'category',
        'clicks',
        'complaints_count',
        'average_dwelltime',
    ]]

    df_activity['activity'] = (df['likes'] + df['buys']) / df['clicks']
    # df_activity['experience'] = - df['complaints_count'] / df['clicks']
    # df_activity['time_spent'] = df['average_dwelltime'] * df['clicks']
    df_activity['errors'] = df['4xx_errors'] + df['5xx_errors']

    return df_activity

df_activity = setup_df_activity(df)
y = df['source_attractiveness']

activity_attractivenes = go.Figure(
    go.Scatter(
        x=df_activity['activity'],
        y=y,
        mode='markers',
        marker=dict(size=1),
    ),
)

activity_attractivenes.update_xaxes(title_text='Активность')
activity_attractivenes.update_yaxes(title_text='Привлекательность')
activity_attractivenes.update_layout(width=700, height=500)
activity_attractivenes.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Появилась линейная зависимость, значит мы на верноv

In [108]:
# divide to test and train
x_train, x_test, y_train, y_test = train_test_split(
    df_activity, y,
    test_size=0.2,
    shuffle=True,
    stratify=df_activity['category']
)

# encode categories 
encoder = OneHotEncoder(drop='first', sparse_output=False)
x_train = np.hstack([x_train.drop(['category'], axis='columns'), encoder.fit_transform(x_train['category'].to_frame())])
x_test = np.hstack([x_test.drop(['category'], axis='columns'), encoder.transform(x_test['category'].to_frame())])

# normalizing
x_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)

In [109]:
import random
from random import randint

random.seed(42)



model = LinearRegression()
mse = 1e10

current_model = LinearRegression()
current_model.fit(x_train, y_train)

current_y_predict = current_model.predict(x_test)
current_mse = metrics.mean_squared_error(y_test, current_y_predict)

if current_mse < mse:
    mse = current_mse
    model = current_model


iterations = 50
for alpha in range(1, iterations + 1):
    current_model = Ridge(alpha=alpha/iterations, random_state=randint(0, 4294967295))
    current_model.fit(x_train, y_train)

    current_y_predict = current_model.predict(x_test)
    current_mse = metrics.mean_squared_error(y_test, current_y_predict)

    if current_mse < mse:
        mse = current_mse
        model = current_model

for alpha in range(1, iterations + 1):
    current_model = Lasso(alpha=alpha/iterations, random_state=randint(0, 4294967295))
    current_model.fit(x_train, y_train)

    current_y_predict = current_model.predict(x_test)
    current_mse = metrics.mean_squared_error(y_test, current_y_predict)

    if current_mse < mse:
        mse = current_mse
        model = current_model

for alpha in range(1, iterations + 1):
    for ratio in range(1, iterations + 1):
        current_model = ElasticNet(alpha=alpha/iterations, l1_ratio=ratio/iterations, random_state=randint(0, 4294967295))
        current_model.fit(x_train, y_train)

        current_y_predict = current_model.predict(x_test)
        current_mse = metrics.mean_squared_error(y_test, current_y_predict)

        if current_mse < mse:
            mse = current_mse
            model = current_model


test_predicts = model.predict(x_test)
print(f'TEST:\n\
    MSE = {metrics.mean_squared_error(y_test, test_predicts)}\n\
    MAE = {metrics.mean_absolute_error(y_test, test_predicts)}\n\
    MAPE = {metrics.mean_absolute_percentage_error(y_test, test_predicts) * 100:.1f}%\n\
    R2 = {metrics.r2_score(y_test, test_predicts)}'
)

train_predicts = model.predict(x_train)
print(f'TRAIN:\n\
    MSE = {metrics.mean_squared_error(y_train, train_predicts)}\n\
    MAE = {metrics.mean_absolute_error(y_train, train_predicts)}\n\
    MAPE = {metrics.mean_absolute_percentage_error(y_train, train_predicts) * 100:.1f}%\n\
    R2 = {metrics.r2_score(y_train, train_predicts)}'
)

go.Figure(
    data=[
        go.Histogram(x=test_predicts, name='test'),
        go.Histogram(x=train_predicts, name='train'),
        go.Histogram(x=initial_df['source_attractiveness'], name='target'),
    ],
    layout=dict(title='Конечная модель')
).show()

model

TEST:
    MSE = 0.004924890426792191
    MAE = 0.0536382016048969
    MAPE = 133.7%
    R2 = 0.9091560996312602
TRAIN:
    MSE = 0.004819586544095667
    MAE = 0.05361226803747835
    MAPE = 113.4%
    R2 = 0.9043842734319687


## Final model submit function

In [110]:
def predict_file_with_model(data_path, model):
    df = pd.read_csv(data_path)
    df = setup_cleared_df(df)
    df = setup_df_activity(df)

    x = np.hstack([df.drop(['category'], axis='columns'), encoder.transform(df['category'].to_frame())])
    x = x_scaler.transform(x)

    predict = model.predict(x)

    data = { "source_attractiveness": predict }
    submit = pd.DataFrame(data)
    submit.to_csv('submission.csv', index_label="ID")

def predict(data_path):
    predict_file_with_model(data_path, model)


predict(test_data_path)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i