# Create `fundingMLP.model`
- If you already have the `fundingMLP.model` file, you can skip this notebook.

In [None]:
import pandas as pd
import json
from os import listdir
from os.path import isfile, join
from math import ceil

In [None]:
# Directory that holds the per-paper JSON files (trailing slash kept on purpose:
# other cells may build file paths by plain string concatenation).
path = 'sorted_papers/'

# Keep regular files only. os.listdir() returns entries in arbitrary order, so the
# names are sorted to make the row order of the resulting DataFrame -- and hence
# the seeded train/test split further down -- reproducible across machines.
fileNames = sorted(f for f in listdir(path) if isfile(join(path, f)))

In [None]:
# Build one feature row per paper, keyed by the paper's 'eid'.
#
# Columns, in order: authorCount, refCount, citedByCount, quatile (quarter of the
# year taken from the month of 'cover-date'), one 0/1 flag per two-digit
# subject-area code prefix '10'..'36', and the boolean label 'funded'.
# NOTE: the column name 'quatile' (sic) is kept as-is because it is part of the
# saved CSV / trained-model schema.
SUBJECT_CODES = [str(code) for code in range(10, 37)]

dic = {}
print('total:', len(fileNames))
for file_idx, file_name in enumerate(fileNames):
    print('creating', file_idx)
    with open(join(path, file_name), 'r', encoding="utf8") as paper_file:
        data = json.load(paper_file)

    eid = data['eid']
    # 'cover-date'[5:7] is read as the month; months 1-3 -> 1, 4-6 -> 2, ...
    quarter = ceil(int(data['cover-date'][5:7]) / 3)

    row = {
        'authorCount': len(data['authors']),
        'refCount': data['ref-count'],
        'citedByCount': data['citedby-count'],
        'quatile': quarter,
    }
    row.update({code: 0 for code in SUBJECT_CODES})
    # The label depends only on the funding list. Previously it was set inside
    # the subject-area loop, so a funded paper without subject areas was
    # silently labelled as unfunded.
    row['funded'] = data['funding'] != []

    for area in data['subject-areas']:
        code = area['@code'][:2]
        if code not in SUBJECT_CODES:
            # Fail loudly instead of silently adding an unexpected column.
            raise KeyError(f"unexpected subject-area code {area['@code']!r} in {file_name}")
        # Presence flag: repeated codes within one paper still count once.
        row[code] = 1

    dic[eid] = row

df = pd.DataFrame.from_dict(dic, orient='index')

display(df)

In [None]:
# Persist the feature table (index included) so it can be reloaded from disk
# instead of re-parsing every JSON file.
df.to_csv(path_or_buf='./predictFundingDf.csv')

In [None]:
# Shortcut: if predictFundingDf.csv already exists, load it here instead of
# rebuilding the table. The saved index comes back as the 'Unnamed: 0' column,
# which is restored as the index.
df = pd.read_csv('./predictFundingDf.csv').set_index('Unnamed: 0')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Label is the 'funded' column; every other column is used as a feature.
y = df['funded']
x = df.drop(columns=['funded'])

# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same funded/unfunded ratio; the seed fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.2, stratify=y, random_state=33799)
x_train, x_test, y_train, y_test = split_parts

print(y_test)

Unnamed: 0
2-s2.0-85146754852     True
2-s2.0-85045872589    False
2-s2.0-85051658859    False
2-s2.0-84992217873     True
2-s2.0-85087089727    False
                      ...  
2-s2.0-85006952323     True
2-s2.0-84927174037    False
2-s2.0-85089272667     True
2-s2.0-85007005033     True
2-s2.0-85016087590     True
Name: funded, Length: 5841, dtype: bool


In [None]:
# Hyper-parameter combinations tried for the MLP. random_state is a
# single-value entry so every candidate is trained with the same seed.
mlp_param_grid = {
    'hidden_layer_sizes': [(100, 75), (100, 75, 50), (150,), (100, 100)],
    'activation': ['logistic'],
    'batch_size': [256, 512],
    'random_state': [33799],
    'alpha': [0.0001, 0.0005, 0.001, 0.002],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Keep the best candidate and report which parameters won.
model = grid_search.best_estimator_
print(grid_search.best_params_)

# Evaluate on the held-out split. The confusion matrix is ordered
# True first, then False.
y_pred = model.predict(x_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[True, False]))
print(classification_report(y_test, y_pred, digits=4))

{'activation': 'logistic', 'alpha': 0.001, 'batch_size': 256, 'hidden_layer_sizes': (150,), 'random_state': 33799}
[[3022  549]
 [ 929 1341]]
              precision    recall  f1-score   support

       False     0.7095    0.5907    0.6447      2270
        True     0.7649    0.8463    0.8035      3571

    accuracy                         0.7470      5841
   macro avg     0.7372    0.7185    0.7241      5841
weighted avg     0.7434    0.7470    0.7418      5841





In [None]:
import pickle

# Serialise the selected estimator to ./fundingMLP.model.
with open('./fundingMLP.model', mode='wb') as model_file:
    pickle.dump(model, model_file)

After adding the scraped dataset, the results for `fundingMLP1.model` are:

{'activation': 'logistic', 'alpha': 0.001, 'batch_size': 256, 'hidden_layer_sizes': (150,), 'random_state': 33799}
```
[[3022  549]
 [ 929 1341]]
              precision    recall  f1-score   support

       False     0.7095    0.5907    0.6447      2270
        True     0.7649    0.8463    0.8035      3571

    accuracy                         0.7470      5841
   macro avg     0.7372    0.7185    0.7241      5841
weighted avg     0.7434    0.7470    0.7418      5841
```

{'activation': 'logistic', 'alpha': 0.0005, 'batch_size': 512, 'hidden_layer_sizes': (100, 75), 'random_state': 33799}
```
[[2525  260]
 [ 717  541]]
              precision    recall  f1-score   support

       False     0.6754    0.4300    0.5255      1258
        True     0.7788    0.9066    0.8379      2785

    accuracy                         0.7583      4043
   macro avg     0.7271    0.6683    0.6817      4043
weighted avg     0.7467    0.7583    0.7407      4043
```

Results for `fundingMLP.model`:
{'activation': 'logistic', 'alpha': 0.0001, 'batch_size': 512, 'hidden_layer_sizes': (100, 50, 25), 'random_state': 33799}
```
[[2480  305]
 [ 675  583]]
              precision    recall  f1-score   support

       False     0.6565    0.4634    0.5433      1258
        True     0.7861    0.8905    0.8350      2785

    accuracy                         0.7576      4043
   macro avg     0.7213    0.6770    0.6892      4043
weighted avg     0.7458    0.7576    0.7443      4043
```

{'activation': 'logistic', 'alpha': 0.002, 'batch_size': 512, 'hidden_layer_sizes': (100, 75, 50), 'random_state': 33799}
```
[[2496  289]
 [ 690  568]]
              precision    recall  f1-score   support

       False     0.6628    0.4515    0.5371      1258
        True     0.7834    0.8962    0.8360      2785

    accuracy                         0.7579      4043
   macro avg     0.7231    0.6739    0.6866      4043
weighted avg     0.7459    0.7579    0.7430      4043
```

{'activation': 'logistic', 'alpha': 0.002, 'batch_size': 256, 'hidden_layer_sizes': (150,), 'random_state': 33799}
```
[[2500  285]
 [ 710  548]]
              precision    recall  f1-score   support

       False     0.6579    0.4356    0.5242      1258
        True     0.7788    0.8977    0.8340      2785

    accuracy                         0.7539      4043
   macro avg     0.7183    0.6666    0.6791      4043
weighted avg     0.7412    0.7539    0.7376      4043
```

With `scoring='f1'`:
{'activation': 'logistic', 'alpha': 0.002, 'batch_size': 512, 'hidden_layer_sizes': (100, 100), 'random_state': 33799}
```
[[2530  255]
 [ 748  510]]
              precision    recall  f1-score   support

       False     0.6667    0.4054    0.5042      1258
        True     0.7718    0.9084    0.8346      2785

    accuracy                         0.7519      4043
   macro avg     0.7192    0.6569    0.6694      4043
weighted avg     0.7391    0.7519    0.7318      4043
```

{'alpha': 0.002, 'hidden_layer_sizes': (150,), 'random_state': 33799}
```
[[2374  411]
 [ 623  635]]
              precision    recall  f1-score   support

       False     0.6071    0.5048    0.5512      1258
        True     0.7921    0.8524    0.8212      2785

    accuracy                         0.7442      4043
   macro avg     0.6996    0.6786    0.6862      4043
weighted avg     0.7345    0.7442    0.7372      4043
```

{'alpha': 0.002, 'hidden_layer_sizes': (150, 150), 'random_state': 33799}
```
[[2420  365]
 [ 702  556]]
              precision    recall  f1-score   support

       False     0.6037    0.4420    0.5103      1258
        True     0.7751    0.8689    0.8194      2785

    accuracy                         0.7361      4043
   macro avg     0.6894    0.6555    0.6648      4043
weighted avg     0.7218    0.7361    0.7232      4043
```

{'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'random_state': 33799}
```
[[2495  290]
 [ 725  533]]
              precision    recall  f1-score   support

       False     0.6476    0.4237    0.5123      1258
        True     0.7748    0.8959    0.8310      2785

    accuracy                         0.7489      4043
   macro avg     0.7112    0.6598    0.6716      4043
weighted avg     0.7353    0.7489    0.7318      4043
```