#### 因为数据量过大，所以采用抽样的方法对数据进行处理。因为数据在多个文件中，需要分批处理。

#### 因为需要测试的probe文件，包含在数据集中，需要拿到对应的标签。所以处理比较麻烦

In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import BaselineOnly
from surprise.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [2]:
#用来临时存放读取的数据
movies_1 = {}
movies_2 = {}
movies_3 = {}
movies_4 = {}

#存放4个数据集的二维列表
train_1 = []
train_2 = []
train_3 = []
train_4 = []

#### 从txt文件中读取数据，并整理格式,变成两层字典，即{movieid:{customerid:rating}}

In [3]:
def readtrain(txt,movie):
    with open(txt, "r") as f: # 打开文件
        for line in f.readlines():# 读取文件
            line = line.replace('\n', '').replace('\r', '')
            if ':' in line:
                movieid = line.split(':')[0]
                movie.setdefault(movieid,{})
            else:
                customerid = line.split(',')[0]
                rating = line.split(',')[1]
                movie[movieid][customerid] = rating

#### 将数据从两层字典即{movieid:{customerid:rating}}转化成pandas的dataframe，即[[userid,movieid,rating]]

In [4]:
def trainlist(train,movie):
    for movieid,customer in movie.items():
        for customerid,rating in customer.items():
            data = []
            data.append(customerid)
            data.append(movieid)
            data.append(rating)
            train.append(data)
        
    train = pd.DataFrame(train)
    train.columns = ['userId', 'movieId',  'rating']
    return train

#### 读取数据并处理数据格式阶段

In [5]:
#读取数据集1，并处理数据
readtrain('combined_data_1.txt',movies_1)
train_1 = trainlist(train_1,movies_1)
print("数据集1共有"+str(len(train_1))+"条数据")

数据集1共有24053764条数据


In [6]:
#读取数据集2，并处理数据
readtrain('combined_data_2.txt',movies_2)
train_2 = trainlist(train_2,movies_2)
print("数据集2共有"+str(len(train_2))+"条数据")

数据集2共有26977591条数据


In [7]:
#读取数据集3，并处理数据
readtrain('combined_data_3.txt',movies_3)
train_3 = trainlist(train_3,movies_3)
print("数据集3共有"+str(len(train_3))+"条数据")

数据集3共有22601629条数据


In [8]:
#读取数据集4，并处理数据
readtrain('combined_data_4.txt',movies_4)
train_4 = trainlist(train_4,movies_4)
print("数据集4共有"+str(len(train_4))+"条数据")

数据集4共有26847523条数据


In [9]:
#读取需要测试的数据集，即probe数据集，并处理数据，同上面过程相似，只不过少一个rating字段，需要单独计算
movies_probe = {}
def readtest(txt):
    with open(txt, "r") as f: # 打开文件
        for line in f.readlines():# 读取文件
            line = line.replace('\n', '').replace('\r', '')
            if ':' in line:
                movieid = line.split(':')[0]
                movies_probe.setdefault(movieid,[])
            else:
                customerid = line.split(',')[0]
                movies_probe[movieid].append(customerid)
                
readtest('probe.txt')              
test = []
for movieid,customer in movies_probe.items():
    for customerid in customer:
        data = []
        data.append(customerid)
        data.append(movieid)
        test.append(data)

test = pd.DataFrame(test)
test.columns = ['userId', 'movieId']
print("probe文件共有"+str(len(test))+"数据")

probe文件共有1408395数据


#### 因为要将Probe作为测试集，所以打算把数据集中属于probe的部分剔除，同时获取probe中的标签，计算RMSE

In [10]:
#设置标志位，拼接dataframe，根据是否有空行能识别出是否为probe中的数据
test['label'] = '1'

##### 查询是否有空值，如果有进行填充，避免影响数据分割

In [11]:
#查看数据集1是否有空值
sum(train_1.isnull().any())

0

In [12]:
#查看数据集2是否有空值
sum(train_2.isnull().any())

0

In [13]:
##查看数据集3是否有空值
sum(train_3.isnull().any())

0

In [14]:
#查看数据集4是否有空值
sum(train_4.isnull().any())

0

##### 矩阵拼接并删除空值。（剩下的都是probe里面的数据）

In [15]:
test_1 = pd.merge(train_1, test, how='left', on=['userId','movieId'])
print("删除空值前共有"+str(len(test_1))+"条数据")
test_1.dropna(axis=0, how='any', inplace=True)
print("删除空值后共有"+str(len(test_1))+"条数据")

删除空值前共有24053764条数据
删除空值后共有577213条数据


In [16]:
test_2 = pd.merge(train_2, test, how='left', on=['userId','movieId'])
print("删除空值前共有"+str(len(test_2))+"条数据")
test_2.dropna(axis=0, how='any', inplace=True)
print("删除空值后共有"+str(len(test_2))+"条数据")

删除空值前共有26977591条数据
删除空值后共有352289条数据


In [17]:
test_3 = pd.merge(train_3, test, how='left', on=['userId','movieId'])
print("删除空值前共有"+str(len(test_3))+"条数据")
test_3.dropna(axis=0, how='any', inplace=True)
print("删除空值后共有"+str(len(test_3))+"条数据")

删除空值前共有22601629条数据
删除空值后共有245903条数据


In [18]:
test_4 = pd.merge(train_4, test, how='left', on=['userId','movieId'])
print("删除空值前共有"+str(len(test_4))+"条数据")
test_4.dropna(axis=0, how='any', inplace=True)
print("删除空值后共有"+str(len(test_4))+"条数据")

删除空值前共有26847523条数据
删除空值后共有232990条数据


#### 将得到标签的测试集数据拼接起来

In [30]:
testset = test_1
testset = testset.append(test_2, ignore_index=True)
testset = testset.append(test_3, ignore_index=True)
testset = testset.append(test_4, ignore_index=True)

In [31]:
testset = testset.drop('label',axis=1)
labels = testset['rating']

#### 训练集太大，内存遇到不够用的情况，抽取一部分数据出来作为训练集 

In [21]:
trainset_temp_1,drop_set = train_test_split(train_1, test_size=0.95, train_size=0.05, random_state=11, shuffle=True)
trainset_temp_2,drop_set = train_test_split(train_2, test_size=0.95, train_size=0.05, random_state=11, shuffle=True)
trainset_temp_3,drop_set = train_test_split(train_3, test_size=0.95, train_size=0.05, random_state=11, shuffle=True)
trainset_temp_4,drop_set = train_test_split(train_4, test_size=0.95, train_size=0.05, random_state=11, shuffle=True)

In [32]:
trainset = trainset_temp_1.reset_index(drop=True).head()
trainset = trainset.append(trainset_temp_2, ignore_index=True)
trainset = trainset.append(trainset_temp_3, ignore_index=True)
trainset = trainset.append(trainset_temp_4, ignore_index=True)

#### 查看dataframe格式的训练集和测试集

In [33]:
len(trainset)

3821341

In [34]:
len(testset)

1408395

In [35]:
trainset

Unnamed: 0,userId,movieId,rating
0,2425195,2346,3
1,1853520,3198,3
2,1857050,2216,4
3,665042,1865,2
4,1310403,3254,3
...,...,...,...
3821336,2246180,14909,4
3821337,2108102,13847,5
3821338,136918,16063,4
3821339,2640550,17395,4


In [36]:
testset

Unnamed: 0,userId,movieId,rating
0,30878,1,4
1,2647871,1,4
2,1283744,1,3
3,2488120,1,5
4,317050,1,5
...,...,...,...
1408390,829192,17770,3
1408391,54864,17770,1
1408392,533482,17770,3
1408393,1196966,17770,2


#### 将训练集通过surprise工具箱的方法进行解析，得到适合推荐算法训练的训练集trainset

In [37]:
reader = Reader(line_format='user item rating', sep='\t')
trainset = Dataset.load_from_df(trainset, reader=reader)
trainset = trainset.build_full_trainset()

#### 拟合Baseline模型

In [39]:
algo = BaselineOnly()
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1c6cb260e88>

#### 使用testset数据集计算RMSE

In [40]:
prediction = []

#### 使用推荐算法的predict方法预测用户评分，并存储在列表中

In [41]:
#查看predict返回值
data = testset.iloc[1]
algo.predict(data[0], data[1], r_ui=None, clip=True, verbose=False)

Prediction(uid='2647871', iid='1', r_ui=None, est=3.2536821951037833, details={'was_impossible': False})

In [42]:
#得到某一个评分
algo.predict(data[0], data[1], r_ui=None, clip=True, verbose=False)[3]

3.2536821951037833

In [43]:
#将预测值存起来
for i in range(len(testset)):
    data = testset.iloc[i]
    prediction.append(algo.predict(data[0], data[1], r_ui=None, clip=True, verbose=False)[3])

In [50]:
labels = testset['rating']

In [51]:
print("均方误差RMSE为："+str(np.sqrt(mean_squared_error(prediction, labels))))

均方误差RMSE为：1.0547529794971073
