In [1]:
#2053635任柯睿
import torch 
import numpy as np
import pandas as pd
import json
from datetime import datetime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### **读取数据**
#### 读取数据集tmdb_5000_credits.csv和tmdb_5000_movies.csv并去除无关因素。

In [2]:
# Seed NumPy's global RNG so the later shuffling is repeatable.
np.random.seed(555)
df1=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df2=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')
# Regression target: the average vote of each movie.
y=df2['vote_average'].astype(float)
# Columns kept as candidate features; everything else in df2 is ignored.
ls=['budget', 'genres','original_language','popularity', 'release_date','revenue', 'runtime','status','vote_count']
# Take an explicit copy: later cells add and overwrite columns on df, and a bare
# selection from df2 can trigger SettingWithCopyWarning on those writes.
df=df2.loc[:,ls].copy()
df

### **数据处理**
#### 提取genres中的类型名称并转换为0/1特征，将release_date拆分为year和month，并对original_language、status等离散属性进行独热编码。
#### 之后对缺失数据用平均值进行填充，并对结果进行0-1归一化（min-max），方便训练。

In [3]:
def process_jsoncols(colname):
    """Return the 'name' value of every entry in ``colname``.

    Args:
        colname: iterable of mappings, each providing a 'name' key
            (here: one parsed JSON list from the genres column).

    Returns:
        A list with the 'name' values in their original order.
    """
    return [entry['name'] for entry in colname]

In [4]:
# Turn each raw JSON string in 'genres' into a comma-separated string of genre names.
# json.loads is called without the `encoding` keyword: Python 3.9+ rejects it
# with a TypeError, and the column already holds str values.
df['genres'] = (
    df['genres']
    .apply(json.loads)
    .apply(process_jsoncols)
    .apply(lambda names: ','.join(map(str, names)))
)

In [5]:
# Parse the release dates once, then derive numeric year and month columns from them.
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['year'] = release_dates.dt.year
df['month'] = release_dates.dt.month

In [6]:
# Collect every distinct genre name that occurs in the comma-separated 'genres' column.
genre_names = set()
for names in df['genres'].str.split(','):
    genre_names.update(names)
# Rows without any genre contribute an empty string; discard() does not raise
# if there happens to be no such row.
genre_names.discard('')
# Sort so the order of the generated feature columns is the same on every run
# (set iteration order for strings varies between interpreter sessions).
genres_list = sorted(genre_names)
genres_list

In [7]:
# Add one 0/1 indicator column per genre.
# Membership is tested against the split genre names rather than with
# str.contains, which treats the name as a regex and also matches substrings.
genre_tokens = df['genres'].str.split(',')
for genre in genres_list:
    df[genre] = genre_tokens.apply(lambda names, genre=genre: 1 if genre in names else 0)

In [8]:
# Columns that contain at least one missing value.
ls = df.columns[df.isna().any()]
print(ls)
# Impute each of those columns with its own mean.
for col in ls:
    df[col] = df[col].fillna(df[col].mean())

In [9]:
# Check whether df still contains missing values (per-column count of nulls)
df.isna().sum()

In [10]:
# The raw date and the genre string have been replaced by derived columns; drop them.
df = df.drop(columns=['release_date', 'genres'])
# One-hot encode the remaining non-numeric columns. dtype=float keeps the dummy
# columns numeric (newer pandas returns bool by default, which does not support
# the subtraction below).
df = pd.get_dummies(df, dtype=float)
# Min-max scale every column to [0, 1].
col_min = df.min()
col_range = df.max() - col_min
# A constant column has range 0; divide by 1 instead so it becomes all zeros
# rather than NaN (0/0).
col_range = col_range.replace(0, 1)
df = (df - col_min) / col_range
df

In [11]:
# Design matrix: the feature values preceded by a column of ones, so that the
# first weight can play the role of the bias term.
feature_values = df.values
bias_column = np.ones(len(feature_values))
x = np.column_stack((bias_column, feature_values))

### **获得训练集和测试集**
#### 将数据以0.8:0.2的比例分为训练集和测试集。

In [12]:
# Split the data into a training set (80%) and a test set (20%).
# random_state pins the shuffle to the same seed used for NumPy when the data is
# loaded, so re-running this cell reproduces the split instead of depending on
# how much of the global random stream has already been consumed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=555)

### **评估函数**
#### 计算MAE和RMSE。

In [13]:
def MAE(r,p):
    """Mean absolute error between true values ``r`` and predictions ``p``.

    Both inputs are converted to float arrays, so elements are compared by
    position (a pandas Series is not aligned by its index labels) and the
    reduction is vectorised instead of iterating in Python.

    Args:
        r: array-like of true values.
        p: array-like of predicted values, same shape as ``r``.

    Returns:
        The mean of ``|r - p|`` taken over the first axis (a scalar for 1-D input).
    """
    diff = np.asarray(r, dtype=float) - np.asarray(p, dtype=float)
    return np.abs(diff).mean(axis=0)

In [14]:
def RMSE(r,p):
    """Root mean squared error between true values ``r`` and predictions ``p``.

    Both inputs are converted to float arrays, so elements are compared by
    position (a pandas Series is not aligned by its index labels) and the
    reduction is vectorised instead of iterating in Python.

    Args:
        r: array-like of true values.
        p: array-like of predicted values, same shape as ``r``.

    Returns:
        ``sqrt(mean((r - p)**2))`` taken over the first axis (a scalar for 1-D input).
    """
    diff = np.asarray(r, dtype=float) - np.asarray(p, dtype=float)
    return np.sqrt((diff ** 2).mean(axis=0))

### **开始训练**
#### epoch=5000，lr=0.1，每100轮对预测结果进行评估。
#### 每条数据有67个属性对照，同时在第一列是一列1，用于与w相乘作为b

In [15]:
# Batch gradient descent for linear regression, minimising the mean squared error.
attr_num=x_train.shape[1]
w=np.zeros(attr_num)  # one weight per column of x_train
epoch=5000
lr=0.1
eval_every=100  # test metrics are recorded and printed once per this many epochs
print("Start training")
print("total epoch={},learning rate={}".format(epoch,lr))
mae_list=[]
rmse_list=[]
# Plain float array of the targets, so the per-epoch arithmetic does not go
# through pandas.
y_train_values=np.asarray(y_train,dtype=float)
n_train=len(x_train)
for i in range(epoch):
    # Residuals of the current model on the training set.
    e=y_train_values-np.dot(x_train,w)
    # The gradient of mean(e**2) with respect to w is -2 * X^T e / n.
    w=w-lr*(-2*np.dot(x_train.T,e)/n_train)
    # Evaluate on the test set only on the epochs that are actually reported.
    if i%eval_every==0:
        y_test_pre=np.dot(x_test,w)
        mae=MAE(y_test,y_test_pre)
        rmse=RMSE(y_test,y_test_pre)
        mae_list.append(mae)
        rmse_list.append(rmse)
        print("epoch{} MAE={:.3f} RMSE={:.3f}".format(i+1,mae,rmse))

# Evaluate once more so y_test_pre, mae and rmse describe the final weights.
y_test_pre=np.dot(x_test,w)
mae=MAE(y_test,y_test_pre)
rmse=RMSE(y_test,y_test_pre)

### **绘制MAE-epoch曲线与RMSE-epoch曲线**

In [16]:
# MAE on the test set versus training epoch
fig, ax = plt.subplots()
# The training loop records one value every 100 epochs, starting at epoch 1,
# so rebuild the matching epoch numbers for the x axis.
mae_epochs = 1 + 100 * np.arange(len(mae_list))
ax.plot(mae_epochs, mae_list)
ax.set_xlabel('epoch')
ax.set_ylabel('MAE')
ax.set_title('Test MAE vs. epoch')
plt.show()

In [17]:
# RMSE on the test set versus training epoch
fig, ax = plt.subplots()
# The training loop records one value every 100 epochs, starting at epoch 1,
# so rebuild the matching epoch numbers for the x axis.
rmse_epochs = 1 + 100 * np.arange(len(rmse_list))
ax.plot(rmse_epochs, rmse_list)
ax.set_xlabel('epoch')
ax.set_ylabel('RMSE')
ax.set_title('Test RMSE vs. epoch')
plt.show()