In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [3]:
ratings = pd.read_csv("rating.csv")
anime = pd.read_csv("anime.csv")

In [4]:
print(ratings.head())
print(anime.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       355      -1
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262 

In [5]:
df = ratings.merge(anime, on="anime_id", how="left")
print(df.head())

   user_id  anime_id  rating_x               name  \
0        1        20        -1             Naruto   
1        1        24        -1      School Rumble   
2        1        79        -1           Shuffle!   
3        1       226        -1         Elfen Lied   
4        1       355        -1  Shakugan no Shana   

                                               genre type episodes  rating_y  \
0  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220      7.81   
1                   Comedy, Romance, School, Shounen   TV       26      8.06   
2  Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...   TV       24      7.31   
3  Action, Drama, Horror, Psychological, Romance,...   TV       13      7.85   
4  Action, Drama, Fantasy, Romance, School, Super...   TV       24      7.74   

    members  
0  683297.0  
1  178553.0  
2  158772.0  
3  623511.0  
4  297058.0  


In [6]:
df['num_words_in_name'] = df['name'].astype(str).apply(lambda x: len(x.split()))

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df['genre'] = df['genre'].astype(str).apply(lambda x: [g.strip() for g in x.split(',')])

# Создаём one-hot encoding
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_ohe = pd.DataFrame(mlb.fit_transform(df['genre']), columns=mlb.classes_, index=df.index)

# Объединяем обратно с df
df = pd.concat([df, genre_ohe], axis=1)

In [9]:
print(df.head())

   user_id  anime_id  rating_x               name  \
0        1        20        -1             Naruto   
1        1        24        -1      School Rumble   
2        1        79        -1           Shuffle!   
3        1       226        -1         Elfen Lied   
4        1       355        -1  Shakugan no Shana   

                                               genre type episodes  rating_y  \
0  [Action, Comedy, Martial Arts, Shounen, Super ...   TV      220      7.81   
1                 [Comedy, Romance, School, Shounen]   TV       26      8.06   
2  [Comedy, Drama, Ecchi, Fantasy, Harem, Magic, ...   TV       24      7.31   
3  [Action, Drama, Horror, Psychological, Romance...   TV       13      7.85   
4  [Action, Drama, Fantasy, Romance, School, Supe...   TV       24      7.74   

    members  num_words_in_name  ...  Slice of Life  Space  Sports  \
0  683297.0                  1  ...              0      0       0   
1  178553.0                  2  ...              0      0     

In [10]:
df = pd.get_dummies(df, columns=['type'], prefix='type')

In [11]:
print(df.head())

   user_id  anime_id  rating_x               name  \
0        1        20        -1             Naruto   
1        1        24        -1      School Rumble   
2        1        79        -1           Shuffle!   
3        1       226        -1         Elfen Lied   
4        1       355        -1  Shakugan no Shana   

                                               genre episodes  rating_y  \
0  [Action, Comedy, Martial Arts, Shounen, Super ...      220      7.81   
1                 [Comedy, Romance, School, Shounen]       26      8.06   
2  [Comedy, Drama, Ecchi, Fantasy, Harem, Magic, ...       24      7.31   
3  [Action, Drama, Horror, Psychological, Romance...       13      7.85   
4  [Action, Drama, Fantasy, Romance, School, Supe...       24      7.74   

    members  num_words_in_name  Action  ...  Vampire  Yaoi  Yuri  nan  \
0  683297.0                  1       1  ...        0     0     0    0   
1  178553.0                  2       0  ...        0     0     0    0   
2  158772.0

In [12]:
exclude_cols = ['name', 'genre','rating_x','user_id','anime_id'] 
feature_cols = [col for col in df.columns if col not in exclude_cols]

X = df[feature_cols]
X = X.drop(columns=['nan'])

In [13]:
print(X.head())

  episodes  rating_y   members  num_words_in_name  Action  Adventure  Cars  \
0      220      7.81  683297.0                  1       1          0     0   
1       26      8.06  178553.0                  2       0          0     0   
2       24      7.31  158772.0                  1       0          0     0   
3       13      7.85  623511.0                  2       1          0     0   
4       24      7.74  297058.0                  3       1          0     0   

   Comedy  Dementia  Demons  ...  Thriller  Vampire  Yaoi  Yuri  type_Movie  \
0       1         0       0  ...         0        0     0     0       False   
1       1         0       0  ...         0        0     0     0       False   
2       1         0       0  ...         0        0     0     0       False   
3       0         0       0  ...         0        0     0     0       False   
4       0         0       0  ...         0        0     0     0       False   

   type_Music  type_ONA  type_OVA  type_Special  type_TV

In [14]:
X = df.drop(columns=['rating_y','genre', 'name','user_id','anime_id','rating_x'])  # Признаки
Y = df['rating_y']  # Целевая переменная


In [None]:
print(X[X.apply(lambda row: row.astype(str).str.contains('Unknown').any(), axis=1)])

In [None]:
X = X[~X.apply(lambda row: row.astype(str).str.contains('Unknown').any(), axis=1)]
Y = Y[X.index]

In [None]:
print(X.dtypes)

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [23]:
print("Размер тренировочного набора X:", X_train.shape)
print("Размер тестового набора X:", X_test.shape)
print("Размер тренировочного набора Y:", Y_train.shape)
print("Размер тестового набора Y:", Y_test.shape)

Размер тренировочного набора X: (5813420, 53)
Размер тестового набора X: (1453355, 53)
Размер тренировочного набора Y: (5813420,)
Размер тестового набора Y: (1453355,)


In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [29]:
model = LinearRegression()

# 2. Обучаем модель на тренировочных данных
model.fit(X_train, Y_train)

# 3. Делаем предсказания на тестовых данных
Y_pred = model.predict(X_test)

# 4. Оценка модели

# R^2 (коэффициент детерминации)
r2 = r2_score(Y_test, Y_pred)

# Средняя абсолютная ошибка (MAE)
mae = mean_absolute_error(Y_test, Y_pred)

# Среднеквадратичная ошибка (MSE)
mse = mean_squared_error(Y_test, Y_pred)

# Корень из MSE (RMSE)
rmse = mean_squared_error(Y_test, Y_pred, squared=False)

# Печатаем результаты
print(f'R^2: {r2}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

# 5. Посмотрим на коэффициенты модели (веса признаков)
print(f'Коэффициенты модели: {model.coef_}')
print(f'Смещение (intercept): {model.intercept_}')

ValueError: could not convert string to float: 'Unknown'