## Import Library

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection

## 1. MovieLens Dataset

#### Import User

In [2]:
user_df_ml = pd.read_csv('../../data/movielens/u.user', sep='|', header=None, names=['Index', 'Age', 'Gender', 'Occupation', 'Zip code'])

In [3]:
user_df_ml

Unnamed: 0,Index,Age,Gender,Occupation,Zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [4]:
total_user_ml = user_df_ml['Index'].nunique()

In [5]:
total_user_ml

943

#### Import Genre

In [6]:
genre_df_ml = pd.read_csv('../../data/movielens/u.genre', sep='|', header=None)

#### Import Movie

In [7]:
movie_df_ml = pd.read_csv('../../data/movielens/u.item', sep='|', header=None, encoding='latin-1', names=['Index', 'Title', 'Release', 'The NaN Column', 'Imdb'] + genre_df_ml[0].tolist())

In [8]:
movie_df_ml

Unnamed: 0,Index,Title,Release,The NaN Column,Imdb,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
total_movie_ml = movie_df_ml['Index'].nunique()

In [10]:
total_movie_ml

1682

#### Format Data

In [11]:
def format_movie_user_rating(data, nb_users, nb_movies):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return np.array(new_data)

#### Format Train Set

In [12]:
training_df_ml = pd.read_csv('../../data/movielens/u1.base', sep='\t', header=None, names=['User Id', 'Movie Id', 'Rating', 'Timestamp'])

In [13]:
training_df_ml

Unnamed: 0,User Id,Movie Id,Rating,Timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [14]:
training_set_ml = format_movie_user_rating(np.array(training_df_ml), total_user_ml, total_movie_ml)

In [15]:
training_set_ml

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [16]:
training_set_df_ml = pd.DataFrame(training_set_ml)

In [17]:
training_set_df_ml = training_set_df_ml.rename(index=user_df_ml['Index'].to_dict(), columns=movie_df_ml['Index'].to_dict())

In [18]:
training_set_df_ml

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Save Train Set Into CSV

In [19]:
training_set_df_ml.to_csv(path_or_buf='../../data/formatted_data/training_set_movie_lens.csv')

#### Format Test Set

In [20]:
test_df_ml = pd.read_csv('../../data/movielens/u1.test', sep='\t', header=None, names=['User Id', 'Movie Id', 'Rating', 'Timestamp'])

In [21]:
test_df_ml

Unnamed: 0,User Id,Movie Id,Rating,Timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198
...,...,...,...,...
19995,458,648,4,886395899
19996,458,1101,4,886397931
19997,459,934,3,879563639
19998,460,10,3,882912371


In [22]:
test_set_df_ml = format_movie_user_rating(np.array(test_df_ml), total_user_ml, total_movie_ml)

In [23]:
test_set_df_ml = pd.DataFrame(test_set_df_ml)

In [24]:
test_set_df_ml

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
test_set_df_ml = test_set_df_ml.rename(index=user_df_ml['Index'].to_dict(), columns=movie_df_ml['Index'].to_dict())

In [26]:
test_set_df_ml

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Save Test Set Into CSV

In [27]:
test_set_df_ml.to_csv(path_or_buf='../../data/formatted_data/test_set_movie_lens.csv')

## 2. Book Crossing

#### Import Book

In [28]:
book_df_bx = pd.read_csv("../../data-raw/book-rating/BX-Books.csv", sep=";", encoding = "ISO-8859-1", on_bad_lines='skip')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [29]:
book_df_bx.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


#### Drop Image Columns

In [30]:
book_df_bx = book_df_bx.drop(labels=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)

In [31]:
book_df_bx.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


#### Import User

In [32]:
user_df_bx = pd.read_csv("../../data-raw/book-rating/BX-Users.csv", sep=";", encoding = "ISO-8859-1", on_bad_lines='skip')

In [33]:
user_df_bx.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


#### Import User Book Rating

In [34]:
book_user_rating_bx = pd.read_csv("../../data-raw/book-rating/BX-Book-Ratings.csv", sep=";", encoding = "ISO-8859-1", on_bad_lines='skip')

In [35]:
book_user_rating_bx.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [36]:
book_user_rating_bx.shape

(1149780, 3)

#### Get Top 1000 User Who Gave Ratings

In [37]:
most_rated_user = book_user_rating_bx['User-ID'].value_counts()[:1000].index.values

In [38]:
most_rated_user[:10]

array([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
       110973, 235105], dtype=int64)

#### Get Top 1000 Book Which Got Ratings

In [39]:
most_rated_book = book_user_rating_bx['ISBN'].value_counts()[:1000].index.values

In [40]:
most_rated_book[:10]

array(['0971880107', '0316666343', '0385504209', '0060928336',
       '0312195516', '044023722X', '0679781587', '0142001740',
       '067976402X', '0671027360'], dtype=object)

#### Filter User By Top 1000 User Who Gave Ratings

In [41]:
user_df_bx = user_df_bx[user_df_bx['User-ID'].isin(most_rated_user)]

In [42]:
user_df_bx = user_df_bx.reset_index(drop=True)

In [43]:
user_df_bx

Unnamed: 0,User-ID,Location,Age
0,254,"minneapolis, minnesota, usa",24.0
1,2033,"omaha, nebraska, usa",27.0
2,2276,"niskayuna, new york, usa",46.0
3,2766,"frisco, texas, usa",42.0
4,2977,"richland, washington, usa",25.0
...,...,...,...
995,277427,"gilbert, arizona, usa",48.0
996,277478,"schiedam, zuid-holland, netherlands",31.0
997,277639,"forsyth, montana, usa",48.0
998,278188,"lake george, new york, usa",34.0


In [44]:
total_user_bx = user_df_bx['User-ID'].nunique()

In [45]:
total_user_bx

1000

#### Save Filtered User To CSV

In [46]:
user_df_bx.to_csv(path_or_buf='../../data/book-crossing/BX-Users-1K.csv', index=False)

#### Filter Book By Top 1000 Book Which Got Ratings

In [47]:
book_df_bx = book_df_bx[book_df_bx['ISBN'].isin(most_rated_book)]

In [48]:
book_df_bx = book_df_bx.reset_index(drop=True)

In [49]:
book_df_bx

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0440234743,The Testament,John Grisham,1999,Dell
1,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume
2,0971880107,Wild Animus,Rich Shapero,2004,Too Far
3,0345402871,Airframe,Michael Crichton,1997,Ballantine Books
4,0345417623,Timeline,MICHAEL CRICHTON,2000,Ballantine Books
...,...,...,...,...,...
987,0425172996,Vector,Robin Cook,2000,Berkley Publishing Group
988,0446604402,The Laws of Our Fathers,Scott Turow,1997,Warner Books
989,0515135739,Eleventh Hour: An FBI Thriller (FBI Thriller (...,Catherine Coulter,2004,Jove Books
990,0743418131,Deck the Halls (Holiday Classics),Mary Higgins Clark,2002,Pocket


In [50]:
total_book_bx = book_df_bx['ISBN'].nunique()

In [51]:
total_book_bx

992

In [52]:
book_df_bx.to_csv(path_or_buf='../../data/book-crossing/BX-Books-1K.csv', index=False)

#### Filter Rating By ISBN In Book DataFrame

In [53]:
book_user_rating_bx = book_user_rating_bx[book_user_rating_bx['ISBN'].isin(book_df_bx['ISBN'].values) & book_user_rating_bx['User-ID'].isin(user_df_bx['User-ID'].values)]

In [54]:
book_user_rating_bx = book_user_rating_bx.reset_index(drop=True)

In [55]:
book_user_rating_bx

Unnamed: 0,User-ID,ISBN,Book-Rating
0,277427,002542730X,10
1,277427,006092988X,0
2,277427,0060930535,0
3,277427,0060934417,0
4,277427,0061009059,9
...,...,...,...
62063,276680,0446670251,0
62064,276680,0452283205,7
62065,276680,0670030643,0
62066,276680,0679731725,0


#### Save Filtered Rating To CSV

In [56]:
book_user_rating_bx.to_csv(path_or_buf='../../data/book-crossing/BX-Ratings-62K.csv', index=False)

#### Split Train and Test Set

In [57]:
training_set_df_bx, test_set_df_bx = model_selection.train_test_split(book_user_rating_bx, train_size=0.8)

In [58]:
training_set_df_bx.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
29073,130474,373484003,0
30583,136205,446609323,0
25886,114868,679731725,0
52907,236058,671028375,0
49287,225810,446365505,0


In [59]:
training_set_df_bx.shape

(49654, 3)

In [60]:
test_set_df_bx.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
47827,219546,451153553,7
48377,224138,156007754,10
44971,204864,553282476,0
37405,168064,553572997,0
45249,206074,515135062,0


In [61]:
test_set_df_bx.shape

(12414, 3)

#### Format Data

In [62]:
isbn_id_dict = {v: k for k, v in book_df_bx['ISBN'].to_dict().items()}
user_id_dict = user_df_bx['User-ID'].to_dict()

def format_book_user_rating(data, total_user, total_book):
    new_data = []
    for i in range(1, total_user):
        id_users = user_id_dict[i]
        id_book = data[data['User-ID'] == id_users]['ISBN'].values
        id_rating = data[data['User-ID'] == id_users]['Book-Rating'].values
        id_book = [isbn_id_dict[i] for i in id_book]
        ratings = np.zeros(total_book)
        ratings[id_book] = id_rating
        new_data.append(list(ratings))
    return np.array(new_data)

#### Format Train Set

In [63]:
train_set_bx = format_book_user_rating(training_set_df_bx, total_user_bx, total_book_bx)

In [64]:
train_set_df_bx = pd.DataFrame(train_set_bx)

In [65]:
train_set_df_bx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,982,983,984,985,986,987,988,989,990,991
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,6.0,0.0,0.0,10.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
train_set_df_bx = train_set_df_bx.rename(index=user_df_bx['User-ID'].to_dict(), columns=book_df_bx['ISBN'].to_dict())

#### Save Train Set Into CSV

In [67]:
train_set_df_bx.to_csv(path_or_buf='../../data/formatted_data/train_set_book_crossing.csv')

##### Format Test Set

In [68]:
test_set_bx = format_book_user_rating(test_set_df_bx, total_user_bx, total_book_bx)

In [69]:
test_set_df_bx = pd.DataFrame(test_set_bx)

In [70]:
test_set_df_bx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,982,983,984,985,986,987,988,989,990,991
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
test_set_df_bx = test_set_df_bx.rename(index=user_df_bx['User-ID'].to_dict(), columns=book_df_bx['ISBN'].to_dict())

In [72]:
test_set_df_bx

Unnamed: 0,0440234743,0452264464,0971880107,0345402871,0345417623,0446310786,0449005615,0671888587,0553582747,0425182908,...,0425178765,0449223604,0345444884,0060008032,0446608602,0425172996,0446604402,0515135739,0743418131,0440217490
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Save Test Set Into CSV

In [73]:
test_set_df_bx.to_csv(path_or_buf='../../data/formatted_data/test_set_book_crossing.csv')