## 特征工程

In [1]:
import pandas as pd
import json
from collections import OrderedDict

In [2]:
user = pd.read_csv('../data/BX-Users.csv', sep=';', encoding='ISO-8859-1', escapechar='\\', quotechar='"')
user.columns = ['user_id', 'location', 'age']
user

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [3]:
user.describe()

Unnamed: 0,user_id,age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


In [4]:
user.isnull().sum() / len(user)

user_id     0.000000
location    0.000000
age         0.397199
dtype: float64

In [5]:
user['age'].fillna(user['age'].mean(), inplace=True)
user['age'] = user['age'].astype('int32')

In [6]:
book = pd.read_csv('../data/BX-Books.csv', sep=';',
                   encoding='ISO-8859-1', escapechar='\\', quotechar='"')
book.columns = ['isbn', 'title', 'author', 'release_year',
                'publisher', 'image_url_s', 'image_url_m', 'image_url_l']
book

Unnamed: 0,isbn,title,author,release_year,publisher,image_url_s,image_url_m,image_url_l
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [7]:
book.describe()

Unnamed: 0,release_year
count,271379.0
mean,1959.75605
std,258.011363
min,0.0
25%,1989.0
50%,1995.0
75%,2000.0
max,2050.0


In [8]:
book.isnull().sum() / len(book)

isbn            0.000000
title           0.000000
author          0.000004
release_year    0.000000
publisher       0.000007
image_url_s     0.000000
image_url_m     0.000000
image_url_l     0.000000
dtype: float64

In [9]:
feature_mapping = {val: i + 1 for i, val in enumerate(list(book['isbn'].unique()))}
book_id = book['isbn'].map(feature_mapping)
book_id

0              1
1              2
2              3
3              4
4              5
           ...  
271374    271375
271375    271376
271376    271377
271377    271378
271378    271379
Name: isbn, Length: 271379, dtype: int64

In [10]:
book.insert(loc=0, column='book_id', value=book_id, allow_duplicates=False)
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...
271374,271375,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,271376,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,271377,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,271378,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [11]:
book['book_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 271379 entries, 0 to 271378
Series name: book_id
Non-Null Count   Dtype
--------------   -----
271379 non-null  int64
dtypes: int64(1)
memory usage: 2.1 MB


In [12]:
book_ratings = pd.read_csv('../data/BX-Book-Ratings.csv',
                           sep=';', encoding='ISO-8859-1', escapechar='\\', quotechar='"')
book_ratings.columns = ['user_id', 'isbn', 'rating']
book_ratings

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [13]:
book_ratings.describe()

Unnamed: 0,user_id,rating
count,1149780.0,1149780.0
mean,140386.4,2.86695
std,80562.28,3.854184
min,2.0,0.0
25%,70345.0,0.0
50%,141010.0,0.0
75%,211028.0,7.0
max,278854.0,10.0


In [14]:
book_ratings['user_id'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1149780 entries, 0 to 1149779
Series name: user_id
Non-Null Count    Dtype
--------------    -----
1149780 non-null  int64
dtypes: int64(1)
memory usage: 8.8 MB


In [15]:
book_ratings['rating'].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: rating, dtype: int64

In [16]:
book_ratings.isnull().sum() / len(book_ratings)

user_id    0.0
isbn       0.0
rating     0.0
dtype: float64

In [17]:
book_ratings = book_ratings.loc[book_ratings['rating'] != 0, :]
book_ratings

Unnamed: 0,user_id,isbn,rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [18]:
book_id = book_ratings['isbn'].map(feature_mapping)

In [19]:
book_ratings.insert(loc=1, column='book_id', value=book_id, allow_duplicates=False)
book_ratings

Unnamed: 0,user_id,book_id,isbn,rating
1,276726,225830.0,0155061224,5
3,276729,246855.0,052165615X,3
4,276729,246856.0,0521795028,6
6,276736,,3257224281,8
7,276737,,0600570967,6
...,...,...,...,...
1149773,276704,69545.0,0806917695,5
1149775,276704,69548.0,1563526298,9
1149777,276709,15980.0,0515107662,10
1149778,276721,56818.0,0590442449,10


In [20]:
book_ratings = book_ratings.dropna(how='any')
book_ratings

Unnamed: 0,user_id,book_id,isbn,rating
1,276726,225830.0,0155061224,5
3,276729,246855.0,052165615X,3
4,276729,246856.0,0521795028,6
8,276744,9296.0,038550120X,7
16,276747,4780.0,0060517794,9
...,...,...,...,...
1149771,276704,882.0,0743211383,7
1149773,276704,69545.0,0806917695,5
1149775,276704,69548.0,1563526298,9
1149777,276709,15980.0,0515107662,10


In [21]:
book_ratings.pop('isbn')
book_ratings

Unnamed: 0,user_id,book_id,rating
1,276726,225830.0,5
3,276729,246855.0,3
4,276729,246856.0,6
8,276744,9296.0,7
16,276747,4780.0,9
...,...,...,...
1149771,276704,882.0,7
1149773,276704,69545.0,5
1149775,276704,69548.0,9
1149777,276709,15980.0,10


In [22]:
book_ratings.loc[:, 'book_id'] = book_ratings['book_id'].astype('int64')
book_ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_ratings.loc[:, 'book_id'] = book_ratings['book_id'].astype('int64')


Unnamed: 0,user_id,book_id,rating
1,276726,225830,5
3,276729,246855,3
4,276729,246856,6
8,276744,9296,7
16,276747,4780,9
...,...,...,...
1149771,276704,882,7
1149773,276704,69545,5
1149775,276704,69548,9
1149777,276709,15980,10


In [23]:
book_ratings.to_csv('../data/rating.csv', index=False)

In [24]:
average_rating = book_ratings.groupby('book_id').mean().loc[:, 'rating'].round(1)
average_rating

book_id
2          7.7
3          7.5
4          7.8
6          8.2
7          8.0
          ... 
271366    10.0
271372     7.0
271374     5.0
271375     7.0
271376     4.0
Name: rating, Length: 149842, dtype: float64

In [25]:
average_rating.index

Int64Index([     2,      3,      4,      6,      7,      8,     10,     11,
                13,     14,
            ...
            271351, 271352, 271353, 271354, 271355, 271366, 271372, 271374,
            271375, 271376],
           dtype='int64', name='book_id', length=149842)

In [26]:
average_rating.values

array([7.7, 7.5, 7.8, ..., 5. , 7. , 4. ])

In [27]:
book.insert(loc=6, column='average_rating', value=0)
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,average_rating,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,0,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,0,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,0,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,0,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,0,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...,...
271374,271375,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),0,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,271376,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,0,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,271377,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,0,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,271378,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,0,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [28]:
book.loc[book['book_id'].isin(average_rating.index), 'average_rating'] = average_rating.values
book

Unnamed: 0,book_id,isbn,title,author,release_year,publisher,average_rating,image_url_s,image_url_m,image_url_l
0,1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,0.0,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,7.7,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,7.5,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,7.8,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,5,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,0.0,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...,...,...
271374,271375,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),7.0,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271375,271376,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,4.0,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271376,271377,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,0.0,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271377,271378,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,0.0,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [29]:
label = book_ratings['rating'] >= 6
label = label.astype('int64')

In [30]:
book_ratings.pop('rating')
book_ratings.insert(loc=book_ratings.shape[1], column='label', value=label, allow_duplicates=False)
book_ratings

Unnamed: 0,user_id,book_id,label
1,276726,225830,0
3,276729,246855,0
4,276729,246856,1
8,276744,9296,1
16,276747,4780,1
...,...,...,...
1149771,276704,882,1
1149773,276704,69545,0
1149775,276704,69548,1
1149777,276709,15980,1


In [31]:
book_ratings['label'].value_counts()

1    321906
0     61946
Name: label, dtype: int64

In [32]:
book_ratings['user_id'].max()

278854

In [33]:
book_ratings['book_id'].max()

271376

In [34]:
user_behavior = book_ratings.loc[book_ratings['label'] == 1, ['user_id', 'book_id']]
user_behavior

Unnamed: 0,user_id,book_id
4,276729,246856
8,276744,9296
16,276747,4780
19,276747,1837
20,276747,6277
...,...,...
1149761,276704,11887
1149771,276704,882
1149775,276704,69548
1149777,276709,15980


In [35]:
user_behavior['user_id'].nunique()

61289

In [36]:
user_behavior['book_id'].nunique()

129020

In [37]:
user_behavior.to_csv('../data/user_behavior.csv', index=False)

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X = book_ratings.iloc[:, :-1]
y = book_ratings.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [41]:
train_data['label'].value_counts()

1    225385
0     43311
Name: label, dtype: int64

In [42]:
test_data['label'].value_counts()

1    96521
0    18635
Name: label, dtype: int64

In [43]:
column_names = book_ratings.columns[:-1]
column_names

Index(['user_id', 'book_id'], dtype='object')

In [44]:
user_columns = ['user_id']
item_columns = ['book_id']
categorical_columns = ['user_id', 'book_id']
numerical_columns = []
vocab_size_map = {
    'user_id': 278858,
    'book_id': 271379
}

In [45]:
feature_map = OrderedDict()
feature_map['dataset_id'] = 'bookcrossing'
feature_map['num_fields'] = len(column_names)

In [46]:
feature_map['feature_specs'] = OrderedDict()
for feature in column_names:
    feature_map['feature_specs'][feature] = {
        'source': 'user' if feature in user_columns else 'item',
        'type': 'numerical' if feature in numerical_columns else 'categorical',
        'vocab_size': 1 if feature in numerical_columns else vocab_size_map[feature],
        'index': train_data.columns.get_loc(feature)
    }

In [47]:
user.to_csv('../data/user.csv', index=False)
book.to_csv('../data/book.csv', index=False)
train_data.to_csv('../data/train.csv', index=False)
test_data.to_csv('../data/test.csv', index=False)
with open('../data/feature_map.json', 'w') as obj:
    json.dump(feature_map, obj)