# World University Rankings

### About Dataset

###### This dataset is taken from the site https://www.kaggle.com/ for educational purposes.

https://www.kaggle.com/datasets/mylesoneill/world-university-rankings

### Loading data

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [9]:
! pip install catboost



In [10]:
from catboost import CatBoostRegressor

In [12]:
# Read the rankings table from cwurData.csv (path is relative to the current working directory).
df = pd.read_csv('cwurData.csv')

In [13]:
# Preview the first 10 rows of the loaded table.
df.head(10)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012
5,6,Princeton University,USA,5,8,14,2,53,33,26,,101,82.5,2012
6,7,University of Oxford,United Kingdom,2,13,28,9,15,13,19,,26,82.34,2012
7,8,Yale University,USA,6,14,31,12,14,6,15,,66,79.14,2012
8,9,Columbia University,USA,7,23,21,10,13,12,14,,5,78.86,2012
9,10,"University of California, Berkeley",USA,8,16,52,6,6,5,3,,16,78.55,2012


### Train, test and validation samples

In [15]:
# Total number of rows available before splitting.
len(df)

2200

In [16]:
# First split: 60% of the rows go to training, the remaining 40% are held out
# (they are divided into validation and test sets in the next cell).
train, test = train_test_split(
    df,
    train_size=0.6,
    random_state=20,
)

In [17]:
# Second split: halve the held-out rows into a validation set and a test set.
# `test` is rebound here, so re-running this cell on its own would split the
# already-halved test set again; re-run the previous split cell first.
val, test = train_test_split(
    test,
    train_size=0.5,
    random_state=20,
)

In [18]:
# Number of rows in the training split.
len(train)

1320

In [19]:
# Number of rows in the test split.
len(test)

440

In [20]:
# Number of rows in the validation split.
len(val)

440

In [21]:
# Sanity check: the three splits together should account for every row of df.
len(train) + len(test) + len(val)

2200

In [22]:
# Fraction of the full dataset that ended up in the test split.
len(test) / len(df)

0.2

In [23]:
# Fraction of the full dataset that ended up in the training split.
len(train) / len(df)

0.6

In [24]:
# Fraction of the full dataset that ended up in the validation split.
len(val) / len(df)

0.2

### Define features

In [26]:
# List the available columns before choosing the features and the target.
train.columns

Index(['world_rank', 'institution', 'country', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year'],
      dtype='object')

In [27]:
# Display the training split.
train

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
21,22,Hebrew University of Jerusalem,Israel,2,24,93,13,101,91,101,,28,60.76,2012
1097,898,Federal University of Santa Catarina,Brazil,11,355,478,210,608,959,800,869.0,737,44.39,2014
1889,690,University of Messina,Italy,38,367,567,218,608,803,812,606.0,871,44.37,2015
71,72,Rutgers University-New Brunswick,USA,49,101,101,59,80,83,72,,85,47.50,2012
681,482,University of Akron,USA,156,355,421,210,637,660,609,596.0,121,45.18,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,353,Eindhoven University of Technology,Netherlands,11,355,315,181,296,555,310,334.0,307,45.82,2014
1607,408,University of Twente,Netherlands,12,367,511,218,343,511,368,358.0,288,45.14,2015
1814,615,Soochow University (Suzhou),China,33,367,567,218,396,773,812,565.0,871,44.49,2015
1428,229,Laval University,Canada,11,292,273,218,207,270,182,200.0,140,46.95,2015


In [28]:
# Model inputs: every column of the table except the regression target `score`.
# NOTE(review): `world_rank` is presumably derived directly from `score`, which
# would make it a target leak — confirm before trusting the reported errors.
X = [
    'world_rank',
    'institution',
    'country',
    'national_rank',
    'quality_of_education',
    'alumni_employment',
    'quality_of_faculty',
    'publications',
    'influence',
    'citations',
    'broad_impact',
    'patents',
    'year',
]

# Columns passed to CatBoost as categorical features.
cat_features = ['institution', 'country']

# Regression target; kept as a one-element list, so train[y] is a one-column DataFrame.
y = ['score']

In [29]:
# Baseline regressor. No learning_rate is passed, so CatBoost chooses one itself
# (the training log below reports the value). MAPE is the metric tracked on the
# eval set, and progress is logged every 100 iterations.
model_1 = CatBoostRegressor(
    cat_features=cat_features,
    eval_metric='MAPE',
    random_seed=20,
    verbose=100,
)

In [30]:
# Fit on the training split; the validation split is supplied as the eval set,
# so the "test" column of the training log refers to validation MAPE.
model_1.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

Learning rate set to 0.05319
0:	learn: 0.0752143	test: 0.0729723	best: 0.0729723 (0)	total: 171ms	remaining: 2m 50s
100:	learn: 0.0049602	test: 0.0057793	best: 0.0057793 (100)	total: 1.64s	remaining: 14.6s
200:	learn: 0.0034658	test: 0.0046663	best: 0.0046663 (200)	total: 2.98s	remaining: 11.8s
300:	learn: 0.0025639	test: 0.0041201	best: 0.0041199 (299)	total: 4.4s	remaining: 10.2s
400:	learn: 0.0019830	test: 0.0037819	best: 0.0037799 (399)	total: 5.87s	remaining: 8.76s
500:	learn: 0.0015692	test: 0.0035622	best: 0.0035622 (500)	total: 7.32s	remaining: 7.29s
600:	learn: 0.0012848	test: 0.0034232	best: 0.0034229 (599)	total: 8.79s	remaining: 5.84s
700:	learn: 0.0010918	test: 0.0033366	best: 0.0033366 (700)	total: 10.3s	remaining: 4.39s
800:	learn: 0.0009404	test: 0.0032673	best: 0.0032673 (800)	total: 11.7s	remaining: 2.92s
900:	learn: 0.0008306	test: 0.0032173	best: 0.0032173 (900)	total: 13.2s	remaining: 1.45s
999:	learn: 0.0007389	test: 0.0031782	best: 0.0031782 (999)	total: 14.7s	re

<catboost.core.CatBoostRegressor at 0x2019c8e3110>

In [31]:
# Raw model_1 predictions for the test split (displayed only, not stored).
model_1.predict(test[X])

array([57.70271806, 44.07800285, 51.39143965, 51.79788235, 49.19958912,
       44.39777386, 49.78025337, 44.71279381, 45.61541953, 44.41041837,
       44.16594959, 44.51712701, 44.65262788, 44.59277858, 45.07019566,
       44.43951175, 44.20121679, 47.88607659, 44.71670788, 45.34110424,
       44.28461897, 46.69230925, 50.73329139, 44.63492769, 44.33051659,
       44.45622158, 45.0225809 , 44.13095393, 47.61730923, 45.73864875,
       54.25656321, 44.19647292, 44.79571167, 62.5573813 , 50.38414206,
       83.59769579, 44.08598055, 45.97133493, 44.92222351, 47.16574352,
       44.43826485, 47.61442945, 45.08389737, 47.62995392, 44.45573691,
       55.78988142, 48.83575077, 44.39001616, 44.22660895, 49.53638459,
       44.72093348, 44.45316159, 44.33455404, 45.30464811, 48.61633884,
       46.82200041, 49.71667311, 45.55028557, 45.33532985, 44.16292185,
       50.48114095, 44.18834568, 44.36982429, 45.8515935 , 44.59886496,
       65.11997406, 45.08418886, 44.95745487, 49.42947749, 44.48

In [32]:
# Store model_1's predictions for the test split in a `score_pred` column.
# assign() builds a new frame instead of writing into the object returned by
# train_test_split: that in-place write can trigger pandas'
# SettingWithCopyWarning, and it mutates a frame earlier cells already displayed.
# Re-running this cell simply overwrites the column.
test = test.assign(score_pred=model_1.predict(test[X]))

In [33]:
# Inspect the test split with the model_1 prediction column next to the true score.
test

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,score_pred
1239,40,University of North Carolina at Chapel Hill,USA,25,124,69,86,32,32,19,21.0,41,57.09,2015,57.702718
2192,993,Southwest Jiaotong University,China,82,367,327,218,937,962,812,998.0,861,44.03,2015,44.078003
42,43,Carnegie Mellon University,USA,31,30,81,26,101,101,61,,101,51.60,2012,51.391440
1290,91,Swiss Federal Institute of Technology in Lausanne,Switzerland,2,367,74,116,124,114,39,110.0,58,51.47,2015,51.797882
1339,140,University of Nottingham,United Kingdom,10,131,109,84,116,176,146,140.0,138,49.02,2015,49.199589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,93,Swiss Federal Institute of Technology in Lausanne,Switzerland,4,355,130,80,126,121,48,124.0,41,51.80,2014,51.802875
1850,651,Ehime University,Japan,40,367,514,218,619,582,511,590.0,418,44.43,2015,44.437060
580,381,Hong Kong Polytechnic University,Hong Kong,5,355,189,210,261,764,406,395.0,481,45.65,2014,45.626619
1389,190,University of South Florida - Tampa,USA,83,367,418,218,233,251,197,164.0,70,47.66,2015,47.746992


In [34]:
def error(y_true, y_pred):
    """Print the MAE and then the MAPE of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        None. Each metric is written to stdout on its own line, MAE first.
    """
    # Each metric is computed and printed in turn, in the order MAE, MAPE.
    for metric in (mean_absolute_error, mean_absolute_percentage_error):
        print(metric(y_true, y_pred))

In [35]:
# Test-set MAE and MAPE of model_1 (predictions stored in `score_pred`).
error(test['score'],test['score_pred'])

0.19002856115163144
0.003198461149147535


### Decrease the learning rate

In [37]:
# Same configuration as the baseline, but with the learning rate fixed at 0.03,
# i.e. below the 0.05319 that CatBoost selected automatically for model_1.
model_2 = CatBoostRegressor(
    cat_features=cat_features,
    learning_rate=0.03,
    eval_metric='MAPE',
    random_seed=20,
    verbose=100,
)

In [38]:
# Fit the low-learning-rate model on the training split, tracking MAPE on the
# validation split.
# NOTE(review): no later cell in the visible notebook predicts with model_2, so
# it is only judged by the validation MAPE in this log — confirm that is intended.
model_2.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

0:	learn: 0.0766332	test: 0.0745126	best: 0.0745126 (0)	total: 14ms	remaining: 14s
100:	learn: 0.0100460	test: 0.0102707	best: 0.0102707 (100)	total: 1.47s	remaining: 13.1s
200:	learn: 0.0041674	test: 0.0051448	best: 0.0051448 (200)	total: 2.92s	remaining: 11.6s
300:	learn: 0.0034498	test: 0.0045526	best: 0.0045526 (300)	total: 4.18s	remaining: 9.72s
400:	learn: 0.0028434	test: 0.0040831	best: 0.0040831 (400)	total: 5.6s	remaining: 8.37s
500:	learn: 0.0024779	test: 0.0038493	best: 0.0038484 (496)	total: 6.99s	remaining: 6.96s
600:	learn: 0.0021617	test: 0.0036459	best: 0.0036459 (600)	total: 8.41s	remaining: 5.58s
700:	learn: 0.0018785	test: 0.0034452	best: 0.0034452 (700)	total: 9.82s	remaining: 4.19s
800:	learn: 0.0016660	test: 0.0032944	best: 0.0032944 (800)	total: 11.3s	remaining: 2.8s
900:	learn: 0.0014715	test: 0.0031728	best: 0.0031728 (900)	total: 12.7s	remaining: 1.39s
999:	learn: 0.0013099	test: 0.0030774	best: 0.0030774 (999)	total: 14.1s	remaining: 0us

bestTest = 0.0030774

<catboost.core.CatBoostRegressor at 0x2019c867aa0>

### Increase the learning rate

In [40]:
# Keyword arguments for the third regressor: same setup as before, but with the
# learning rate raised to 0.1.
parameters = dict(
    cat_features=cat_features,
    eval_metric='MAPE',
    learning_rate=0.1,
    random_seed=20,
    verbose=100,
)

In [41]:
# Build the third regressor by unpacking the `parameters` dict as keyword arguments.
model_3 = CatBoostRegressor(**parameters)

In [42]:
# Fit the high-learning-rate model on the training split, tracking MAPE on the
# validation split.
model_3.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

0:	learn: 0.0723764	test: 0.0698988	best: 0.0698988 (0)	total: 15.8ms	remaining: 15.8s
100:	learn: 0.0041848	test: 0.0055762	best: 0.0055762 (100)	total: 1.45s	remaining: 12.9s
200:	learn: 0.0024822	test: 0.0043134	best: 0.0043134 (200)	total: 2.86s	remaining: 11.4s
300:	learn: 0.0016843	test: 0.0038440	best: 0.0038440 (300)	total: 4.34s	remaining: 10.1s
400:	learn: 0.0012008	test: 0.0035856	best: 0.0035856 (400)	total: 5.77s	remaining: 8.62s
500:	learn: 0.0009471	test: 0.0034558	best: 0.0034558 (500)	total: 7.2s	remaining: 7.17s
600:	learn: 0.0007454	test: 0.0033789	best: 0.0033771 (597)	total: 8.61s	remaining: 5.72s
700:	learn: 0.0006186	test: 0.0033399	best: 0.0033399 (700)	total: 10.1s	remaining: 4.3s
800:	learn: 0.0005255	test: 0.0033131	best: 0.0033119 (796)	total: 11.5s	remaining: 2.86s
900:	learn: 0.0004542	test: 0.0032896	best: 0.0032896 (900)	total: 13s	remaining: 1.43s
999:	learn: 0.0003976	test: 0.0032711	best: 0.0032711 (999)	total: 14.5s	remaining: 0us

bestTest = 0.00327

<catboost.core.CatBoostRegressor at 0x2019c865520>

In [43]:
# Raw model_3 predictions for the test split (displayed only, not stored).
model_3.predict(test[X])

array([ 57.72371608,  44.06359362,  51.81420228,  51.51579764,
        49.12444523,  44.41748501,  49.4909818 ,  44.69078814,
        45.44511458,  44.39032083,  44.14775996,  44.50293011,
        44.60926339,  44.59603755,  45.17342651,  44.47110881,
        44.21563767,  47.88927271,  44.65685159,  45.34132191,
        44.26641365,  46.81783418,  51.49024729,  44.6461291 ,
        44.35799819,  44.34667163,  44.94549791,  44.10153825,
        47.87638157,  45.81579918,  53.94263584,  44.18226527,
        44.74586703,  63.22021697,  50.42613351,  81.84126184,
        44.06454904,  45.74408601,  45.01521542,  47.45701105,
        44.5181911 ,  47.68312268,  45.14349695,  47.77311713,
        44.34670285,  55.27403397,  49.2540537 ,  44.39983482,
        44.27135007,  49.62199557,  44.74220835,  44.44282782,
        44.32896526,  45.20773591,  48.62545163,  46.91390968,
        50.41531659,  45.72250531,  45.23934632,  44.15166717,
        50.09014781,  44.10340833,  44.36757552,  45.80

In [44]:
# Store model_3's predictions for the test split. The column keeps the name
# `score_pred_2` because the comparison cell below reads it, even though the
# values come from model_3 (model_2 is not used for prediction here).
# assign() returns a new frame rather than writing into the split-derived frame
# in place, which avoids a possible SettingWithCopyWarning; re-running this
# cell simply overwrites the column.
test = test.assign(score_pred_2=model_3.predict(test[X]))

In [45]:
# Inspect the test split with both prediction columns next to the true score.
test

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,score_pred,score_pred_2
1239,40,University of North Carolina at Chapel Hill,USA,25,124,69,86,32,32,19,21.0,41,57.09,2015,57.702718,57.723716
2192,993,Southwest Jiaotong University,China,82,367,327,218,937,962,812,998.0,861,44.03,2015,44.078003,44.063594
42,43,Carnegie Mellon University,USA,31,30,81,26,101,101,61,,101,51.60,2012,51.391440,51.814202
1290,91,Swiss Federal Institute of Technology in Lausanne,Switzerland,2,367,74,116,124,114,39,110.0,58,51.47,2015,51.797882,51.515798
1339,140,University of Nottingham,United Kingdom,10,131,109,84,116,176,146,140.0,138,49.02,2015,49.199589,49.124445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,93,Swiss Federal Institute of Technology in Lausanne,Switzerland,4,355,130,80,126,121,48,124.0,41,51.80,2014,51.802875,51.732508
1850,651,Ehime University,Japan,40,367,514,218,619,582,511,590.0,418,44.43,2015,44.437060,44.430617
580,381,Hong Kong Polytechnic University,Hong Kong,5,355,189,210,261,764,406,395.0,481,45.65,2014,45.626619,45.605327
1389,190,University of South Florida - Tampa,USA,83,367,418,218,233,251,197,164.0,70,47.66,2015,47.746992,47.748199


In [46]:
# Test-set MAE and MAPE of model_1 again, for comparison with model_3 in the next cell.
error(test['score'],test['score_pred'])

0.19002856115163144
0.003198461149147535


In [47]:
# Test-set MAE and MAPE of model_3 (its predictions are stored in `score_pred_2`).
error(test['score'],test['score_pred_2'])

0.20952891944627522
0.0035709067926039126
