## 特征工程

In [1]:
import pandas as pd
import json
from collections import OrderedDict

In [2]:
user = pd.read_csv('../data/BX-Users.csv', sep=';', encoding='latin-1', low_memory=False)
user.columns = ['user_id', 'location', 'age']
user

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [3]:
user.describe()

Unnamed: 0,user_id,age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


In [4]:
user.isnull().sum() / len(user)

user_id     0.000000
location    0.000000
age         0.397199
dtype: float64

In [5]:
user['age'].fillna(user['age'].mean(), inplace=True)
user['age'] = user['age'].astype('int32')

In [6]:
book = pd.read_csv('../data/BX-Books.csv', sep=';',
                   encoding='latin-1', low_memory=False, on_bad_lines='skip')
book.columns = ['isbn', 'title', 'author', 'release_year',
                'publisher', 'image_url_s', 'image_url_m', 'image_url_l']
book

Unnamed: 0,isbn,title,author,release_year,publisher,image_url_s,image_url_m,image_url_l
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [7]:
book.describe()

Unnamed: 0,isbn,title,author,release_year,publisher,image_url_s,image_url_m,image_url_l
count,271360,271360,271359,271360,271358,271360,271360,271357
unique,271360,242135,102023,118,16807,271044,271044,271041
top,195153448,Selected Poems,Agatha Christie,2002,Harlequin,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/185326119X.0...,http://images.amazon.com/images/P/225307649X.0...
freq,1,27,632,17627,7535,2,2,2


In [8]:
book.isnull().sum() / len(book)

isbn            0.000000
title           0.000000
author          0.000004
release_year    0.000000
publisher       0.000007
image_url_s     0.000000
image_url_m     0.000000
image_url_l     0.000011
dtype: float64

In [9]:
feature_mapping = {val: i + 1 for i, val in enumerate(list(book['isbn'].unique()))}
book_id = book['isbn'].map(feature_mapping)
book_id

0              1
1              2
2              3
3              4
4              5
           ...  
271355    271356
271356    271357
271357    271358
271358    271359
271359    271360
Name: isbn, Length: 271360, dtype: int64

In [10]:
book.insert(loc=0, column='book_id', value=book_id, allow_duplicates=False)
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...
271355,271356,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,271357,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,271358,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,271359,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [11]:
book['book_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 271360 entries, 0 to 271359
Series name: book_id
Non-Null Count   Dtype
--------------   -----
271360 non-null  int64
dtypes: int64(1)
memory usage: 2.1 MB


In [12]:
book_ratings = pd.read_csv('../data/BX-Book-Ratings.csv',
                         sep=';', encoding='latin-1', low_memory=False)
book_ratings.columns = ['user_id', 'isbn', 'rating']
book_ratings

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [13]:
book_ratings.describe()

Unnamed: 0,user_id,rating
count,1149780.0,1149780.0
mean,140386.4,2.86695
std,80562.28,3.854184
min,2.0,0.0
25%,70345.0,0.0
50%,141010.0,0.0
75%,211028.0,7.0
max,278854.0,10.0


In [14]:
book_ratings['user_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1149780 entries, 0 to 1149779
Series name: user_id
Non-Null Count    Dtype
--------------    -----
1149780 non-null  int64
dtypes: int64(1)
memory usage: 8.8 MB


In [15]:
book_ratings['rating'].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: rating, dtype: int64

In [16]:
book_ratings.isnull().sum() / len(book_ratings)

user_id    0.0
isbn       0.0
rating     0.0
dtype: float64

In [17]:
book_id = book_ratings['isbn'].map(feature_mapping)

In [18]:
book_ratings.insert(loc=1, column='book_id', value=book_id, allow_duplicates=False)
book_ratings

Unnamed: 0,user_id,book_id,isbn,rating
0,276725,2967.0,034545104X,0
1,276726,225817.0,0155061224,5
2,276727,11054.0,0446520802,0
3,276729,246839.0,052165615X,3
4,276729,246840.0,0521795028,6
...,...,...,...,...
1149775,276704,69545.0,1563526298,9
1149776,276706,52541.0,0679447156,0
1149777,276709,15979.0,0515107662,10
1149778,276721,56815.0,0590442449,10


In [19]:
book_ratings = book_ratings.dropna(how='any')
book_ratings

Unnamed: 0,user_id,book_id,isbn,rating
0,276725,2967.0,034545104X,0
1,276726,225817.0,0155061224,5
2,276727,11054.0,0446520802,0
3,276729,246839.0,052165615X,3
4,276729,246840.0,0521795028,6
...,...,...,...,...
1149774,276704,69544.0,0876044011,0
1149775,276704,69545.0,1563526298,9
1149776,276706,52541.0,0679447156,0
1149777,276709,15979.0,0515107662,10


In [20]:
book_ratings.pop('isbn')
book_ratings

Unnamed: 0,user_id,book_id,rating
0,276725,2967.0,0
1,276726,225817.0,5
2,276727,11054.0,0
3,276729,246839.0,3
4,276729,246840.0,6
...,...,...,...
1149774,276704,69544.0,0
1149775,276704,69545.0,9
1149776,276706,52541.0,0
1149777,276709,15979.0,10


In [21]:
book_ratings['book_id'] = book_ratings['book_id'].astype('int64')
book_ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings['book_id'] = book_ratings['book_id'].astype('int64')


Unnamed: 0,user_id,book_id,rating
0,276725,2967,0
1,276726,225817,5
2,276727,11054,0
3,276729,246839,3
4,276729,246840,6
...,...,...,...
1149774,276704,69544,0
1149775,276704,69545,9
1149776,276706,52541,0
1149777,276709,15979,10


In [22]:
book_ratings.to_csv('../data/rating.csv', index=False)

In [23]:
average_rating = book_ratings.groupby('book_id').mean().loc[:, 'rating']
average_rating

book_id
1         0.000000
2         4.928571
3         5.000000
4         4.272727
5         0.000000
            ...   
271356    7.000000
271357    4.000000
271358    0.000000
271359    0.000000
271360    0.000000
Name: rating, Length: 270151, dtype: float64

In [24]:
average_rating.index

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            271351, 271352, 271353, 271354, 271355, 271356, 271357, 271358,
            271359, 271360],
           dtype='int64', name='book_id', length=270151)

In [25]:
average_rating.values

array([0.        , 4.92857143, 5.        , ..., 0.        , 0.        ,
       0.        ])

In [26]:
book.insert(loc=6, column='average_rating', value=0)
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,average_rating,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,0,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,0,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,0,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,0,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,0,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...,...
271355,271356,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),0,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,271357,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,0,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,271358,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,0,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,271359,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,0,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [27]:
book.loc[book['book_id'].isin(average_rating.index), 'average_rating'] = average_rating.values
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,average_rating,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,0.000000,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,4.928571,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,5.000000,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,4.272727,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,0.000000,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...,...
271355,271356,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),7.000000,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,271357,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,4.000000,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,271358,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,0.000000,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,271359,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,0.000000,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [28]:
label = book_ratings['rating'] >= 5
label = label.astype('int64')

In [29]:
book_ratings.pop('rating')
book_ratings.insert(loc=book_ratings.shape[1], column='label', value=label, allow_duplicates=False)
book_ratings

Unnamed: 0,user_id,book_id,label
0,276725,2967,0
1,276726,225817,1
2,276727,11054,0
3,276729,246839,0
4,276729,246840,1
...,...,...,...
1149774,276704,69544,0
1149775,276704,69545,1
1149776,276706,52541,0
1149777,276709,15979,1


In [30]:
book_ratings['label'].value_counts()

0    663885
1    367251
Name: label, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X = book_ratings.iloc[:, :-1]
y = book_ratings.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [34]:
train_data['label'].value_counts()

0    464430
1    257365
Name: label, dtype: int64

In [35]:
test_data['label'].value_counts()

0    199455
1    109886
Name: label, dtype: int64

In [36]:
column_names = book_ratings.columns[:-1]
column_names

Index(['user_id', 'book_id'], dtype='object')

In [37]:
user_columns = ['user_id', 'rating']
item_columns = ['book_id']
categorical_columns = ['user_id', 'book_id']
numerical_columns = ['rating']

In [38]:
feature_map = OrderedDict()
feature_map['dataset_id'] = 'bookcrossing'
feature_map['num_fields'] = len(column_names)

In [39]:
book_ratings['user_id'].max()

278854

In [40]:
book_ratings['book_id'].max()

271360

In [41]:
feature_map['feature_specs'] = OrderedDict()
for feature in column_names:
    feature_map['feature_specs'][feature] = {
        'source': 'user' if feature in user_columns else 'item',
        'type': 'numerical' if feature in numerical_columns else 'categorical',
        'vocab_size': 1 if feature in numerical_columns else book_ratings[feature].max() + 1,
        'index': train_data.columns.get_loc(feature)
    }

In [42]:
user.to_csv('../data/user.csv', index=False)
book.to_csv('../data/book.csv', index=False)
train_data.to_csv('../data/train.csv', index=False)
test_data.to_csv('../data/test.csv', index=False)
with open('../data/feature_map.json', 'w') as obj:
    json.dump(feature_map, obj)