In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split
#数据集加载##############
train_data = pd.read_csv('./85book_Rating/train.csv', index_col=None, sep=',')
test_data = pd.read_csv('./85book_Rating/test.csv', index_col=None, sep=',')

#读取时需要先将下面两个文件重新保存为UTF-8(BOM)格式
book_data = pd.read_csv('./85book_Rating/book.csv', index_col=None, sep=',')
user_data = pd.read_csv('./85book_Rating/user.csv', index_col=None, sep=',')
user_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [2]:
def GetAgeGroup(user):
    if pd.isnull(user.Age):
        return 4
    elif user.Age < 18:
        return 0
    elif user.Age <= 24:
        return 1
    elif user.Age <= 34:
        return 2
    elif user.Age <= 44:
        return 3
    elif user.Age <= 49:
        return 5
    elif user.Age <= 55:
        return 6
    else:
        return 7
user_data['Age_Encoding'] = user_data.apply(GetAgeGroup, axis=1)
user_data.head(5)

Unnamed: 0,User-ID,Location,Age,Age_Encoding
0,1,"nyc, new york, usa",,4
1,2,"stockton, california, usa",18.0,1
2,3,"moscow, yukon territory, russia",,4
3,4,"porto, v.n.gaia, portugal",17.0,0
4,5,"farnborough, hants, united kingdom",,4


In [3]:
#手动填补，否则下方会报错
user_data.iloc[29419, list(user_data.columns).index('Location')] = 'st. louis, missouri, usa'
user_data.iloc[134376, list(user_data.columns).index('Location')] = 'lawrenceville, new jersey, usa'


In [4]:
#nyc, new york, usa	
def GetLocationDetail(user):
    location = user['Location'].split(',')
    user['Country'] = location[2].strip()
    user['State'] = location[1].strip()
    user['City'] = location[0].strip()
    return user
user_data = user_data.apply(GetLocationDetail, axis=1)
user_data.head()

Unnamed: 0,User-ID,Location,Age,Age_Encoding,Country,State,City
0,1,"nyc, new york, usa",,4,usa,new york,nyc
1,2,"stockton, california, usa",18.0,1,usa,california,stockton
2,3,"moscow, yukon territory, russia",,4,russia,yukon territory,moscow
3,4,"porto, v.n.gaia, portugal",17.0,0,portugal,v.n.gaia,porto
4,5,"farnborough, hants, united kingdom",,4,united kingdom,hants,farnborough


In [5]:
user_data.isnull().sum()

User-ID              0
Location             0
Age             110762
Age_Encoding         0
Country              0
State                0
City                 0
dtype: int64

In [6]:
user_data.fillna('unknown', inplace=True)
user_data.isnull().sum().sum()

0

In [7]:
book_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [8]:
book_data.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
dtype: int64

In [9]:
book_data.fillna('unknown', inplace=True)
book_data.isnull().sum().sum()

0

In [10]:
def BookPublicYear(data):
    if type(data) == str:
        if not data.isdigit():
            return 5
    year = int(data)
    if year < 1900 or year > 2019:
        return 5
    else:
        return (year - 1900) // 10
    
book_data['year'] = book_data['Year-Of-Publication'].apply(BookPublicYear)

In [11]:
book_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,year
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,10
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,10
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,9
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,9
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,9


In [12]:
#删除无用字段
book_data = book_data.drop(columns=[ 'Year-Of-Publication'])
user_data = user_data.drop(columns=['Location', 'Age'])

In [13]:
#数据合并按照userid和moiveid
df_merge = pd.merge(train_data, book_data, how='inner', on=None, left_on='ISBN', right_on='ISBN',
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)

df = pd.merge(df_merge, user_data, how='inner', on=None, left_on='User-ID', right_on='User-ID',
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)
df.fillna('unknown', inplace=True)
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Publisher,year,Age_Encoding,Country,State,City
0,8,2005018,5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,10,4,canada,ontario,timmins
1,8,374157065,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,Farrar Straus Giroux,9,4,canada,ontario,timmins
2,8,393045218,0,The Mummies of Urumchi,E. J. W. Barber,W. W. Norton & Company,9,4,canada,ontario,timmins
3,8,399135782,0,The Kitchen God's Wife,Amy Tan,Putnam Pub Group,9,4,canada,ontario,timmins
4,8,425176428,0,What If?: The World's Foremost Military Histor...,Robert Cowley,Berkley Publishing Group,10,4,canada,ontario,timmins


In [14]:
y=df[['Book-Rating']]
x=df.drop(columns=['Book-Rating'])
x.sample(3)

Unnamed: 0,User-ID,ISBN,Book-Title,Book-Author,Publisher,year,Age_Encoding,Country,State,City
295091,112437,553586181,The Bachelor List,JANE FEATHER,Bantam,10,2,usa,louisiana,ragley
658872,247429,892831227,"A crisis of truth: The attack on faith, morali...",Ralph Martin,Servant Books,8,3,usa,wisconsin,marshfield
423345,163128,553263951,Rotation Diet,Martin Katahn,Bantam Doubleday Dell,8,4,usa,virginia,virginia beach


In [15]:
#测试集处理
dftset_merge = pd.merge(test_data, book_data, how='inner', on=None, left_on='ISBN', right_on='ISBN',
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)

df_test = pd.merge(dftset_merge, user_data, how='inner', on=None, left_on='User-ID', right_on='User-ID',
      left_index=False, right_index=False, sort=True,
      suffixes=('_x', '_y'), copy=True, indicator=False)
df_test.fillna('unknown', inplace=True)
test_x = df_test.drop(columns=['id'])
test_x.sample(3)

Unnamed: 0,User-ID,ISBN,Book-Title,Book-Author,Publisher,year,Age_Encoding,Country,State,City
40106,54222,671524313,The Girlfriends' Guide to Pregnancy,Vicki Iovine,Pocket,9,4,usa,kansas,wichita
200653,270605,345431057,Slaves in the Family (Ballantine Reader's Circle),Edward Ball,Ballantine Books,9,4,usa,illinois,bourbonnais
16511,23902,486415872,Crime and Punishment (Dover Thrift Editions),Fyodor Dostoyevsky,Dover Publications,10,4,united kingdom,england,london


In [16]:
'''
x['year_gap'] = x['year'] - x['Age_Encoding']
test_x['year_gap'] = test_x['year'] - test_x['Age_Encoding']
test_x.head()
#'''
#加了后early stopping更快，效果更差

"\nx['year_gap'] = x['year'] - x['Age_Encoding']\ntest_x['year_gap'] = test_x['year'] - test_x['Age_Encoding']\ntest_x.head()\n#"

In [17]:
#kind_features = list(set(test_x.columns) - set(['year', 'Age_Encoding', 'year_gap']))
kind_features = list(set(test_x.columns) - set(['year', 'Age_Encoding']))
cat_indexes = [list(test_x.columns).index(kind_feature) for kind_feature in kind_features]
cat_indexes

[3, 7, 4, 1, 9, 0, 8, 2]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=10)
clf = CatBoostRegressor(iterations=5000, learning_rate=0.1, od_type="Iter", l2_leaf_reg=3,depth=10 ,
                        #logging_level='Silent',
                         loss_function='MAE',
                         cat_features=cat_indexes,
                         random_seed=2406,
                        verbose=50,
                        early_stopping_rounds=300,
                        #task_type='GPU',devices='2'
                            )

    # clf.fit(x,y,eval_set=(x_test, y_test))
clf.fit(x_train, y_train, eval_set=(x_test, y_test))

predict_train = clf.predict(x_test)
prediction = clf.predict(test_x)
prediction[:4]


0:	learn: 2.5971864	test: 2.5716114	best: 2.5716114 (0)	total: 826ms	remaining: 1h 8m 50s
50:	learn: 2.2000973	test: 2.1149063	best: 2.1149063 (50)	total: 23.2s	remaining: 37m 28s
100:	learn: 2.1942361	test: 2.1107046	best: 2.1106937 (99)	total: 52.2s	remaining: 42m 10s
150:	learn: 2.1897344	test: 2.1082436	best: 2.1082436 (150)	total: 1m 20s	remaining: 42m 53s
200:	learn: 2.1848626	test: 2.1063866	best: 2.1063866 (200)	total: 1m 45s	remaining: 42m 10s
250:	learn: 2.1818962	test: 2.1058029	best: 2.1058029 (250)	total: 2m 11s	remaining: 41m 29s
300:	learn: 2.1791403	test: 2.1052506	best: 2.1052035 (292)	total: 2m 44s	remaining: 42m 50s
350:	learn: 2.1713806	test: 2.0991329	best: 2.0991329 (350)	total: 3m 24s	remaining: 45m 8s
400:	learn: 2.1668897	test: 2.0977127	best: 2.0976493 (381)	total: 4m 5s	remaining: 46m 57s
450:	learn: 2.1220466	test: 2.0347822	best: 2.0347822 (450)	total: 4m 45s	remaining: 47m 58s
500:	learn: 2.1148007	test: 2.0326859	best: 2.0326076 (496)	total: 5m 36s	remain

array([8.52174601e-07, 2.32594411e-07, 1.20321566e-06, 6.15106352e-08])

In [19]:
print(clf.best_score_)
print(clf.best_iteration_)
for feature, importance in zip(x.columns, clf.feature_importances_):
    print(feature, importance)

{'learn': {'MAE': 2.073670788308874}, 'validation': {'MAE': 2.0316437419962576}}
674
User-ID 49.405149355378235
ISBN 0.0
Book-Title 0.0
Book-Author 9.221842548741389
Publisher 9.192101535433242
year 4.414616812962976
Age_Encoding 5.463312877509501
Country 5.034216663928159
State 11.55701702007249
City 5.711743185973988


In [23]:
submission = pd.DataFrame({'id': df_test.id, 'score': prediction})
submission[['id', 'score']].to_csv('./submissions/final_final_result.csv',sep=',',index=False,header=None)
submission.head()

Unnamed: 0,id,score
0,0,8.521746e-07
1,1,2.325944e-07
2,2,1.203216e-06
3,3,6.151064e-08
4,4,4.353539


In [24]:
def normalize(x):
    if x < 0:
        return 0
    if x > 10:
        return 10
    return x
submission['score'] = submission['score'].apply(normalize)
submission.describe()

Unnamed: 0,id,score
count,206235.0,206235.0
mean,103117.0,1.943703
std,59535.060721,2.833079
min,0.0,0.0
25%,51558.5,1.266796e-07
50%,103117.0,1.749096e-06
75%,154675.5,4.237436
max,206234.0,10.0


In [25]:
submission[['id', 'score']].to_csv('./submissions/final_final_result.csv',sep=',',index=False,header=None)
submission.head()

Unnamed: 0,id,score
0,0,8.521746e-07
1,1,2.325944e-07
2,2,1.203216e-06
3,3,6.151064e-08
4,4,4.353539
