#### Open a TPU session and mount your drive to Google Colab

In [None]:
# Select TensorFlow 1.x: the code below uses tf.Session and tf.contrib, which belong to the 1.x API.
%tensorflow_version 1.x
import os
import pprint
import json
import tensorflow as tf

# Colab exposes the TPU endpoint through COLAB_TPU_ADDR; stop early when the runtime is not a TPU runtime.
assert "COLAB_TPU_ADDR" in os.environ, "ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!"
TPU_ADDRESS = "grpc://" + os.environ["COLAB_TPU_ADDR"] 
# NOTE(review): TPU_TOPOLOGY is not read anywhere else in the visible notebook.
TPU_TOPOLOGY = "2x2"
print("TPU address is", TPU_ADDRESS)

# Authenticate the Colab user (interactive prompt).
from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Forward the credentials stored in /content/adc.json to the TPU so that it can access GCS.
  # NOTE(review): the file is presumably written by auth.authenticate_user() above -- confirm.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

In [None]:

# Mount Google Drive into the Colab filesystem, then make the project folder the
# working directory so that the relative paths used by later cells resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
import os
# 'your_working_dir' is a placeholder: replace it with your own folder name.
os.chdir('/content/drive/My Drive/Colab Notebooks/your_working_dir')


#### Download tensorflow checkpoint from S3 location provided
+ The MathBERT-basevocab-uncased model artifacts are stored in an S3 bucket; each file can be downloaded with wget, e.g. 'wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_config.json'
+ The MathBERT-mathvocab-uncased model artifacts are stored in an S3 bucket; each file can be downloaded with wget, e.g. 'wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-mathvocab-uncased/bert_config.json'

In [None]:
# Create a folder for the model artifacts and switch into it
# ('your_model_folder' is a placeholder name -- use your own).
!mkdir your_model_folder
%cd your_model_folder

In [None]:
# Download the MathBERT-basevocab-uncased artifacts into the current folder:
# the model config, the vocabulary and the three checkpoint files (.index, .meta, .data shard).
!wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_config.json
!wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/vocab.txt
!wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_model.ckpt.index
!wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_model.ckpt.meta
!wget http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_model.ckpt.data-00000-of-00001

--2021-05-20 02:47:06--  http://tracy-nlp-models.s3.amazonaws.com/mathbert-basevocab-uncased/bert_config.json
Resolving tracy-nlp-models.s3.amazonaws.com (tracy-nlp-models.s3.amazonaws.com)... 52.216.101.115
Connecting to tracy-nlp-models.s3.amazonaws.com (tracy-nlp-models.s3.amazonaws.com)|52.216.101.115|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 570 [application/json]
Saving to: ‘bert_config.json’


2021-05-20 02:47:06 (78.6 MB/s) - ‘bert_config.json’ saved [570/570]



#### Download MathBERT training scripts
+ git clone the MathBERT repo at https://github.com/tbs17/MathBERT/


In [None]:
%cd ../ #clone the repo outside of your model folder
!git clone https://github.com/tbs17/MathBERT.git

#### Start Fine-tuning
+ Prepare your dataset so that it is compatible with the BERT training data format. Please refer to the details at https://github.com/google-research/bert. 
+ Alternatively, you can use our function 'split_3data_label()' to split your data into 3 parts (train, dev and test) and save each part together with its labels.

+ Use the script below to fine-tune from the MathBERT artifacts.
+ For the 3 downstream tasks we had, we used the task names below:
    + skill code prediction: 'COLA_385'
    + auto-grading classification: 'auto_grade'
    + knowledge tracing classification: 'KT'
 + Depending on your number of labels, change the following line accordingly in run_classifier.py:
    ```
        def get_labels(self):
            return [str(x) for x in range(your_num_labels)]
    ```

In [None]:
def split_3data_label(org_path,out_dir):
    """Split a labelled CSV into train/dev/test files, each saved with its labels.

    The source file is read with its header row discarded and its columns named
    ``text`` and ``label``. 20% of the rows are held out as the test set, then
    10% of the remaining rows become the dev set. Both splits use
    ``random_state=111`` so that a re-run reproduces the same partition.

    Args:
        org_path: Path of the UTF-8 source CSV with one header row
            (presumably text first, label second -- verify against your data).
        out_dir: Directory that receives ``train_with_label.csv``,
            ``dev_with_label.csv`` and ``test_with_label.csv``; it is created,
            including missing parents, when it does not exist yet.

    Returns:
        None. The split sizes are printed and the three files are written.
    """
    import os

    import pandas as pd
    from sklearn.model_selection import train_test_split

    data = pd.read_csv(org_path, encoding='utf-8', names=['text', 'label'], header=0)
    print(f'total sample is {data.shape[0]}')

    # First carve out the test set, then take the dev set from what is left.
    df_train, df_test = train_test_split(data, test_size=0.2, random_state=111)
    df_bert_train, df_bert_dev = train_test_split(df_train, test_size=0.1, random_state=111)

    # exist_ok avoids the separate existence check (and its race) of the original.
    os.makedirs(out_dir, exist_ok=True)

    # All three files are comma-separated and keep the text/label header row.
    splits = (('train', df_bert_train), ('dev', df_bert_dev), ('test', df_test))
    for split_name, split_df in splits:
        split_df.to_csv(os.path.join(out_dir, '{}_with_label.csv'.format(split_name)), index=False)

    print(f'training samples are {df_bert_train.shape[0]}\n'
        f'eval samples are {df_bert_dev.shape[0]}\n'
        f'testing samples are {df_test.shape[0]}'
        )

In [None]:
# Please find the full list of tasks and their fine-tuning hyperparameters
# here https://github.com/google-research/albert/blob/master/run_glue.sh

# The trailing "#@param" annotations render these two assignments as Colab form fields.
BUCKET = "your_google_cloud_bucket" #@param { type: "string" }
TASK = 'auto_grade' #@param {type:"string"}

BASE_DIR = "gs://" + BUCKET
# An empty BUCKET leaves only the bare "gs://" scheme, which is not a usable location.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BUCKET.")
# NOTE(review): DATA_DIR is not read by any other cell in the visible notebook.
DATA_DIR = os.path.join(BASE_DIR, "data")

# Output folder name = task name + a free-text tag describing the run settings.
OUTPUT_DIR = 'gs://{}/BERT/mathBERT/{}_{}'.format(BUCKET, TASK,'MathBERT_LR2E-5_BS64_MS512_EP5_baseVocab_testing')
# Uses `os` and `tf` imported by the TPU set-up cell at the top of the notebook.
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))



In [None]:
import time
# Wall-clock timing of the whole fine-tuning run (reported in minutes below).
start=time.time()
os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR
# $TASK, $OUTPUT_DIR and $TPU_ADDRESS are filled in from the notebook variables of the same name.
# NOTE(review): the script, data and model paths (bert/, mathBERT-downstreamTasks/, Upload_Models/MathBERT-orig/)
# and the gs://your_bucket checkpoint do not match the folders created by the earlier cells; treat them as
# placeholders and adjust them to your own layout.
# NOTE(review): --warmup_step is presumably a flag of the MathBERT copy of run_classifier.py -- confirm the name.
!python bert/run_classifier.py \
  --data_dir=mathBERT-downstreamTasks/auto_grade/ \
  --bert_config_file=Upload_Models/MathBERT-orig/bert_config.json \
  --vocab_file=Upload_Models/MathBERT-orig/vocab.txt \
  --task_name=$TASK \
  --output_dir=$OUTPUT_DIR \
  --init_checkpoint='gs://your_bucket/MathBERT-basevocab-uncased/bert_model.ckpt' \
  --do_lower_case=True \
  --do_train=True \
  --do_eval=True \
  --do_predict=True \
  --max_seq_length=512 \
  --warmup_step=1000 \
  --learning_rate=2e-5 \
  --num_train_epochs=5 \
  --save_checkpoints_steps=2000 \
  --train_batch_size=64 \
  --eval_batch_size=32 \
  --predict_batch_size=16 \
  --tpu_name=$TPU_ADDRESS \
  --use_tpu=True
end=time.time()
print(f'it took {(end-start)/60} mins to finish ')


+ Downloading the prediction result and eval result to your local folder

In [None]:
import os
folder_name=OUTPUT_DIR.split('/')[5]
os.environ['OUTPUT_DIR']=OUTPUT_DIR
os.environ['folder_name']=folder_name
!gsutil cp $OUTPUT_DIR/eval_results.txt your_output_folder/$folder_name-eval_results.txt
!gsutil cp $OUTPUT_DIR/test_results.tsv your_output_folder/$folder_name-test_results.tsv

#### Convert Testing result 
+ The prediction output needs to be formatted before it can be evaluated.
+ The examples below convert the predictions of the auto-grading and KC tasks, whose datasets have 5 labels and 385 labels respectively.

In [None]:
def match_top3_f1_acc(data_dir,label_Path,out_dir):
    """Score converted top-k prediction files against the gold labels.

    Every ``.csv`` file found in ``data_dir`` is joined column-wise with the
    labels read from ``label_Path``. Hit flags ``matched1``/``matched2``/
    ``matched3`` (label found within the first 1/2/3 ranks) are added, the
    merged table is written to ``<out_dir>/<file stem>_matched.csv`` and
    weighted F1 and accuracy figures are printed.

    Args:
        data_dir: Directory holding the converted prediction CSV files; each
            needs ``top1``, ``top2`` and ``top3`` columns. Other file types
            are listed in the log but skipped.
        label_Path: CSV with a header row; its data is loaded as a single
            ``label`` column.
        out_dir: Directory for the ``*_matched.csv`` files; created (with
            parents) when missing.

    Returns:
        None. Results are printed and written to disk.
    """
    import os
    from pathlib import Path

    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score

    label_data=pd.read_csv(label_Path,names=['label'],header=0)
    print(f'test data shape{label_data.shape}')
    # Resolve the output folder once instead of rebinding ``out_dir`` for every file.
    out_path=Path(out_dir)

    for i, file in enumerate(os.listdir(data_dir)):
        pred_dataPath=os.path.join(data_dir,file)
        print('======')
        print(i,pred_dataPath)
        if not pred_dataPath.endswith('.csv'):
            continue

        pred_data=pd.read_csv(pred_dataPath,encoding='ISO-8859-1')
        print(f'predicted data shape: {pred_data.shape}')
        print('total {} classes are predicted as top class'.format(len(pred_data['top1'].unique())))
        # Rows are paired by index, so predictions and labels must be in the same order.
        match=pd.concat([pred_data,label_data],axis=1)
        print(f'merged data shape{match.shape}')

        hit1=match['top1']==match['label']
        hit2=match['top2']==match['label']
        hit3=match['top3']==match['label']

        # Distinct classes that were predicted correctly at each rank.
        correct_top1=match[hit1].top1.unique()
        correct_top2=match[hit2].top2.unique()
        correct_top3=match[hit3].top3.unique()
        correct_all=list(set(list(correct_top1)+list(correct_top2)+list(correct_top3)))
        print('Correct top 1 label {}'.format(len(correct_top1)))
        print('Correct top 2 label {}'.format(len(correct_top2)))
        print('Correct top 3 label {}'.format(len(correct_top3)))
        print('Total top 3 Correct labels {}'.format(len(correct_all)))

        match['matched1']=np.where(hit1,1,0)
        match['matched2']=np.where(hit1 | hit2,1,0)
        match['matched3']=np.where(hit1 | hit2 | hit3,1,0)

        marker=file.split('.')[0]
        out_path.mkdir(parents=True,exist_ok=True)
        match.to_csv('{}/{}_matched.csv'.format(out_path,marker),index=False)

        # The top-2/top-3 figures are running sums of the per-rank scores. For
        # accuracy this equals top-k accuracy as long as the ranks of a row are
        # distinct classes.
        # NOTE(review): summing per-rank weighted F1 is kept as in the original -- confirm it is the intended metric.
        print('---Below is F1 Score(weighted)--')
        top1_f1=round(f1_score(match['label'], match['top1'], average='weighted')*100,3)
        top2_f1=round(f1_score(match['label'], match['top2'], average='weighted')*100,3)
        top3_f1=round(f1_score(match['label'], match['top3'], average='weighted')*100,3)
        print('Top1 label F1 score(weighted): {}%'.format(top1_f1))
        print('Top2 label F1 score(weighted): {}%'.format(top2_f1+top1_f1))
        print('Top3 label F1 score(weighted): {}%'.format(top3_f1+top2_f1+top1_f1))

        print ('---Below is Sklearn accuracy---')
        top1_accuracy=round(accuracy_score(match['label'], match['top1'])*100,3)
        top2_accuracy=round(accuracy_score(match['label'], match['top2'])*100,3)
        top3_accuracy=round(accuracy_score(match['label'], match['top3'])*100,3)
        print('Top1 label accuracy score: {}%'.format(top1_accuracy))
        print('Top2 label accuracy score: {}%'.format(top2_accuracy+top1_accuracy))
        print('Top3 label accuracy score: {}%'.format(top3_accuracy+top2_accuracy+top1_accuracy))


def convert_autoGrade(orig_testPath,pred_testPath,org_dataPath,out_dir):
    """Turn raw class probabilities into a ranked top-5 table for auto-grading.

    The original test file supplies ``guid`` (its first column, as text),
    ``question`` and ``answer``; the prediction file supplies one score column
    per class. For every row the five highest-scoring classes and their scores
    are stored as ``top1``..``top5`` / ``top1_probability``..
    ``top5_probability``. Class indices 0-4 are mapped to the labels 1-5. The
    table is written to ``<out_dir>/<prediction file stem>_converted.csv``.

    Ranking uses one stable descending sort per row, so tied scores are ordered
    by class index and a class can never occupy two ranks of the same row.

    Args:
        orig_testPath: Tab-separated test file with a header row containing
            ``question`` and ``answer`` columns.
        pred_testPath: Tab-separated, header-less file of per-class scores
            with at least five columns.
        org_dataPath: Unused; kept so that existing calls keep working.
        out_dir: Output directory; created (with parents) when missing.

    Returns:
        pandas.DataFrame: the converted table that was written to disk.

    Raises:
        IndexError: when the prediction file has fewer than five columns.
        ValueError: when the prediction file has fewer rows than the test file.
    """
    from pathlib import Path

    import numpy as np
    import pandas as pd

    n_ranks=5
    # Class index (as text) -> label; an index outside this map stays as text.
    label_map_dict={'0':1,'1':2,'2':3,'3':4,'4':5}

    #read the original test data for the text and id
    df_test = pd.read_csv(orig_testPath, sep='\t')
    df_test['guid']=df_test.iloc[:,0].astype(str)
    print(f'original test file has shape {df_test.shape}')
    #read the results data for the probabilities
    df_result = pd.read_csv(pred_testPath, sep='\t', header=None)
    print(f'predicted test file has shape {df_result.shape}')

    if df_result.shape[1] < n_ranks:
        raise IndexError('prediction file has {} score columns, {} are required'.format(df_result.shape[1], n_ranks))
    if len(df_result) < len(df_test):
        raise ValueError('prediction file has {} rows but the test file has {}'.format(len(df_result), len(df_test)))

    out_path=Path(out_dir)
    out_path.mkdir(parents=True,exist_ok=True)

    # Rank all classes once per row. Negating the scores gives descending order
    # while the stable sort keeps the lower class index first on ties.
    scores=df_result.to_numpy()
    order=np.argsort(-scores,axis=1,kind='stable')[:,:n_ranks]
    ranked_scores=np.take_along_axis(scores,order,axis=1)
    class_ids=df_result.columns.to_numpy()

    columns={'guid': df_test['guid'],
        'question': df_test['question'],
        'answer': df_test['answer']}
    for rank in range(n_ranks):
        rank_name='top{}'.format(rank+1)
        # Series indexed like df_result so that rows align with the test file by index.
        columns[rank_name]=pd.Series(class_ids[order[:,rank]],index=df_result.index)
        columns[rank_name+'_probability']=pd.Series(ranked_scores[:,rank],index=df_result.index)
    df_map_result = pd.DataFrame(columns)
    print(f'mapped test file has shape {df_map_result.shape}')

    for rank in range(n_ranks):
        rank_name='top{}'.format(rank+1)
        as_text=df_map_result[rank_name].astype(str)
        df_map_result[rank_name]=as_text.map(lambda value: label_map_dict.get(value,value))

    marker=str(pred_testPath).split('/')[-1].split('.')[0]
    df_map_result.to_csv('{}/{}_converted.csv'.format(out_path,marker),index=False)
    print(df_map_result.shape)
    return df_map_result

In [None]:
# Inputs for the auto-grading conversion; every path is an example to replace with your own.
orig_testPath='auto_grade/test.tsv'  # original test data (tab-separated)
pred_testPath='output/auto_grade_MathBERT_LR2E-5_BS64_MS512_EP5_baseVocab_testing-test_results.tsv'  # your own predicted test data
org_dataPath='auto_grade/auto_grade_original_full_data.csv'  # the original full set of auto grade data (not read by convert_autoGrade)
out_dir='TEST_converted_autoGrade_MATHBERT'  # name your own output dir
df_map_result=convert_autoGrade(orig_testPath,pred_testPath,org_dataPath,out_dir)
# Preview the first rows of the converted table.
df_map_result.head()

In [None]:
# Inputs for the skill-code (385 labels) conversion; every path is an example to replace with your own.
orig_testPath='TAPT/skill_code_DESC_TITLE_PROBLEM/CoLA/test.tsv'  # original test data
pred_testPath='EVAL_RESULT/MathBERT/COLA_385_MathBERT-TAPT_LR5E-5_BS64_MS512_EP25_customVocab-V2_FIT_SEED5_V2-test_results.tsv'  # your own predicted test data
org_dataPath='further-pre-training/CORPUS/ALL_GRADES/DESC_TITLE_PROBLEM_combined.csv'  # the original full set
out_dir='TEST_converted_DESC_TITLE_PROB_MATHBERT'  # name your own output dir
# NOTE(review): convert_test_result is not defined in any visible cell of this notebook, so this
# call raises NameError on a fresh kernel -- define or import it before running this cell.
df_map_result=convert_test_result(orig_testPath,pred_testPath,org_dataPath,out_dir)
df_map_result.head()

#### Evaluate for F1, Accuracy and AUC

In [None]:
def match_top3_f1_acc_autoGrade(data_dir,label_Path,out_dir):
    """Score converted auto-grading predictions against the gold labels.

    Every ``.csv`` file found in ``data_dir`` is joined column-wise with the
    non-missing labels read from ``label_Path``. Hit flags ``matched1``/
    ``matched2``/``matched3`` (label found within the first 1/2/3 ranks) are
    added, the merged table is written to
    ``<out_dir>/<file stem>_matched.csv`` and weighted F1 and accuracy figures
    are printed.

    Args:
        data_dir: Directory holding the converted prediction CSV files; each
            needs ``top1``, ``top2`` and ``top3`` columns. Other file types
            are listed in the log but skipped.
        label_Path: CSV with a header row; its data is loaded as the two
            columns ``Index`` and ``label``. Rows with missing values are
            dropped.
        out_dir: Directory for the ``*_matched.csv`` files; created (with
            parents) when missing.

    Returns:
        None. Results are printed and written to disk.
    """
    import os
    from pathlib import Path

    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score

    label_data=pd.read_csv(label_Path,names=['Index','label'],header=0)
    label_data_nna=label_data.dropna()

    print(f'test data shape{label_data.shape}')
    print(f'test data shape after dropping na {label_data_nna.shape}')
    # Resolve the output folder once instead of rebinding ``out_dir`` for every file.
    out_path=Path(out_dir)

    for i, file in enumerate(os.listdir(data_dir)):
        pred_dataPath=os.path.join(data_dir,file)
        print('======')
        print(i, pred_dataPath)
        if not pred_dataPath.endswith('.csv'):
            continue

        pred_data=pd.read_csv(pred_dataPath,encoding='ISO-8859-1')
        print(f'predicted data shape: {pred_data.shape}')
        print('total {} classes are predicted as top class'.format(len(pred_data['top1'].unique())))
        # Rows are paired by index. dropna() keeps the original row numbers, so a
        # dropped label row leaves the prediction with the same index without a label.
        # NOTE(review): confirm that predictions are numbered like the unfiltered label file.
        match=pd.concat([pred_data,label_data_nna],axis=1)
        print(f'merged data shape{match.shape}')

        hit1=match['top1']==match['label']
        hit2=match['top2']==match['label']
        hit3=match['top3']==match['label']

        # Distinct classes that were predicted correctly at each rank.
        correct_top1=match[hit1].top1.unique()
        correct_top2=match[hit2].top2.unique()
        correct_top3=match[hit3].top3.unique()
        correct_all=list(set(list(correct_top1)+list(correct_top2)+list(correct_top3)))
        print('Correct top 1 label {}'.format(len(correct_top1)))
        print('Correct top 2 label {}'.format(len(correct_top2)))
        print('Correct top 3 label {}'.format(len(correct_top3)))
        print('Total top 3 Correct labels {}'.format(len(correct_all)))

        match['matched1']=np.where(hit1,1,0)
        match['matched2']=np.where(hit1 | hit2,1,0)
        match['matched3']=np.where(hit1 | hit2 | hit3,1,0)

        marker=file.split('.')[0]
        out_path.mkdir(parents=True,exist_ok=True)
        match.to_csv('{}/{}_matched.csv'.format(out_path,marker),index=False)

        # The top-2/top-3 figures are running sums of the per-rank scores. For
        # accuracy this equals top-k accuracy as long as the ranks of a row are
        # distinct classes.
        # NOTE(review): summing per-rank weighted F1 is kept as in the original -- confirm it is the intended metric.
        print('---Below is F1 Score(weighted)--')
        top1_f1=round(f1_score(match['label'], match['top1'], average='weighted')*100,3)
        top2_f1=round(f1_score(match['label'], match['top2'], average='weighted')*100,3)
        top3_f1=round(f1_score(match['label'], match['top3'], average='weighted')*100,3)
        print('Top1 label F1 score(weighted): {}%'.format(top1_f1))
        print('Top2 label F1 score(weighted): {}%'.format(top2_f1+top1_f1))
        print('Top3 label F1 score(weighted): {}%'.format(top3_f1+top2_f1+top1_f1))

        print ('---Below is Sklearn accuracy---')
        top1_accuracy=round(accuracy_score(match['label'], match['top1'])*100,3)
        top2_accuracy=round(accuracy_score(match['label'], match['top2'])*100,3)
        top3_accuracy=round(accuracy_score(match['label'], match['top3'])*100,3)
        print('Top1 label accuracy score: {}%'.format(top1_accuracy))
        print('Top2 label accuracy score: {}%'.format(top2_accuracy+top1_accuracy))
        print('Top3 label accuracy score: {}%'.format(top3_accuracy+top2_accuracy+top1_accuracy))

In [None]:
def convert_auc_autoGrade(orig_testPath,pred_testDir,label_Path,org_dataPath,out_dir):
    """Print the weighted one-vs-one ROC AUC of every prediction file in a folder.

    Each file in ``pred_testDir`` whose name ends with ``test_results.tsv`` is
    read as a header-less table of per-class scores and scored against the
    ``label`` column of ``label_Path``.

    Args:
        orig_testPath: Tab-separated original test file; only its shape
            (after adding a ``guid`` column) is reported.
        pred_testDir: Directory that is scanned for ``*test_results.tsv`` files.
        label_Path: CSV containing a ``label`` column with the gold labels.
        org_dataPath: Unused; kept so that existing calls keep working.
        out_dir: Directory that is created (with parents) when missing;
            nothing is written into it by this function.

    Returns:
        None. The AUC of each prediction file is printed as a percentage.
    """
    import os
    from pathlib import Path

    import pandas as pd
    from sklearn.metrics import roc_auc_score

    #read the original test data for the text and id
    df_test = pd.read_csv(orig_testPath, sep='\t',engine='python')
    df_test['guid']=df_test.iloc[:,0].astype(str)
    print(f'original test file has shape {df_test.shape}')
    print('----')
    Path(out_dir).mkdir(parents=True,exist_ok=True)

    # The gold labels are the same for every prediction file, so load them once.
    label_data=pd.read_csv(label_Path,usecols=['label'])

    #read the results data for the probabilities
    for i, f in enumerate(os.listdir(pred_testDir)):
        if not f.endswith('test_results.tsv'):
            continue
        print(i, f)

        df_result = pd.read_csv(os.path.join(pred_testDir,f), sep='\t', header=None)
        print(f'predicted test file has shape {df_result.shape}')
        auc=round(roc_auc_score(label_data['label'], df_result,multi_class='ovo',average='weighted')*100,3)
        # Report the actual number of score columns instead of a fixed "5".
        print(f'average auc for {df_result.shape[1]} classes is {auc} %!')
        print('----')

In [None]:
# Score the converted auto-grading predictions; the paths are examples to replace with your own.
data_dir='TEST_converted_autoGrade_MATHBERT'  # folder with the converted prediction CSV files
label_Path='auto_grade/test_labels.csv'  # gold labels of the test set
out_dir='AUTO_GRADE_matched_MATHBERT'  # receives the *_matched.csv files
match_top3_f1_acc_autoGrade(data_dir,label_Path,out_dir)

In [None]:
# AUC for the auto-grading task; the paths are examples to replace with your own.
orig_testPath='auto_grade/test.tsv'  # original test data (tab-separated)
pred_testDir='MathBERT/AUTO-GRADE/'  # folder scanned for *test_results.tsv prediction files
org_dataPath='auto_grade/auto_grade_original_full_data.csv'  # not read by convert_auc_autoGrade
label_Path='auto_grade/test_labels.csv'  # gold labels of the test set
out_dir='TEST_converted_autoGrade_MATHBERT'
convert_auc_autoGrade(orig_testPath,pred_testDir,label_Path,org_dataPath,out_dir)


In [None]:
# AUC for the skill-code task; the paths are examples to replace with your own.
orig_testPath='TAPT/skill_code_DESC_TITLE_PROBLEM/CoLA/test.tsv'
pred_testDir='EVAL_RESULT/MathBERT/skillCode_TAPT/'
org_dataPath='further-pre-training/CORPUS/ALL_GRADES/DESC_TITLE_PROBLEM_combined.csv'
label_Path='TAPT/skill_code_DESC_TITLE_PROBLEM/CoLA/test_labels.csv'
out_dir='TEST_converted_DESC_TITLE_PROB_MATHTAPT_V2'
# NOTE(review): convert_auc_skillCode is not defined in any visible cell of this notebook, so this
# call raises NameError on a fresh kernel -- define or import it before running this cell.
convert_auc_skillCode(orig_testPath,pred_testDir,label_Path,org_dataPath,out_dir)
# df_map_result.head()