> ### Imports

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from iqual import nlpmodel, evaluation, crossval

> ### Load `annotated (human-coded)` and `unannotated` datasets

In [2]:
# Both CSV files are read from the same directory, given relative to the notebook.
data_dir = "../data"
annotated_csv_path = os.path.join(data_dir, "annotated.csv")
unannotated_csv_path = os.path.join(data_dir, "unannotated.csv")

# Annotated (human-coded) rows and unannotated rows, one DataFrame each.
human_coded_df = pd.read_csv(annotated_csv_path)
uncoded_df = pd.read_csv(unannotated_csv_path)

> ### Configure training data

In [3]:
### Columns holding the question text and the answer text
question_col = 'Q_en'
answer_col = 'A_en'

### Annotation codes to model (each one is used as a label column)
code_variables = [
    'religious',
    'secular',
    'no_ambition',
    'vague_job',
    'job_secular',
    'vocational_training',
    'entrepreneur',
    'education_low',
    'education_neutral',
    'education_high',
    'education_religious',
    'marriage',
    'migration',
    'vague_non_specific',
    'reliance_on_god',
    'ability_high',
    'ability_low',
    'budget_high',
    'budget_low',
    'awareness_information_high',
    'awareness_information_low',
    'camp_regulations',
    'covid_impacts',
    'public_assistance',
    'worries_anxieties',
]

In [4]:
# Scoring dict for evaluation; F1 is the only metric requested here.
metric_names = ['f1']
scoring_dict = evaluation.get_scoring_dict(metric_names)

> ### Configure a Hyperparameter Grid for cross-validation + fitting

In [6]:
## Directory holding the precomputed vector dictionaries (one `<model>.pkl` per model)
dict_dir = "../dictionaries"

### Sentence-Transformers precomputed vectors
sbert_models = ["all-mpnet-base-v2", "distiluse-base-multilingual-cased-v2"]
sbert_model_paths = [os.path.join(dict_dir, f"{name}.pkl") for name in sbert_models]

### Spacy precomputed vectors
spacy_models = ["en_core_web_sm"]
spacy_model_paths = [os.path.join(dict_dir, f"{name}.pkl") for name in spacy_models]

### All precomputed vectors (sentence-transformers first, then spaCy)
model_paths = sbert_model_paths + spacy_model_paths

In [8]:
# ---------------------------------------------------------------------------
# Vectorizer grids ("Input" step)
# ---------------------------------------------------------------------------

# Precomputed (saved-dictionary) vectors for both the question and the answer.
PRETRAINED_QA_PARAMS = {
    "Input": {
        "question": {"vectorizer": {"model": model_paths, "env": ['saved-dictionary']}},
        "answer": {"vectorizer": {"model": model_paths, "env": ['saved-dictionary']}},
    },
}

# Precomputed vectors for the answer only; the question is set to "drop".
PRETRAINED_A_PARAMS = {
    "Input": {
        "question": ["drop"],
        "answer": {"vectorizer": {"model": model_paths, "env": ['saved-dictionary']}},
    },
}

# scikit-learn bag-of-words vectorizers for both the question and the answer.
SKLEARN_QA_PARAMS = {
    "Input": {
        "question": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "env": ['scikit-learn'],
                'max_features': np.arange(1000, 6500, 1000),
            },
        },
        "answer": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "env": ['scikit-learn'],
                'max_features': np.arange(1500, 8500, 500),
            },
        },
    },
}

# scikit-learn bag-of-words vectorizers for the answer only.
SKLEARN_A_PARAMS = {
    "Input": {
        "question": ["drop"],
        "answer": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "env": ['scikit-learn'],
                'max_features': np.arange(1500, 8500, 500),
            },
        },
    },
}

# ---------------------------------------------------------------------------
# Classifier grids ("Classifier" step)
# ---------------------------------------------------------------------------

LOGISTIC_PARAMS = {
    "Classifier": {
        "model": ["LogisticRegression"],
        "C": [0.01, 0.1],
        'penalty': ['l1', 'l2'],
        'solver': ['saga'],
    },
}

RANDOM_FOREST_PARAMS = {
    "Classifier": {
        "model": ["RandomForestClassifier"],
        "n_estimators": [100, 200],
        'max_depth': [5, 10, 25],
    },
}

# NOTE(review): recent scikit-learn releases renamed SGDClassifier's loss="log"
# to "log_loss" -- confirm "log" is still accepted by the installed version.
SGD_PARAMS = {
    "Classifier": {
        "model": ["SGDClassifier"],
        "loss": ["hinge", "log"],
        "penalty": ["l2", "l1"],
    },
}

SVM_PARAMS = {
    "Classifier": {
        "model": ["SVC"],
        "C": [0.01, 0.1],
        'kernel': ['linear', 'rbf'],
    },
}

KNN_PARAMS = {
    "Classifier": {
        "model": ["KNeighborsClassifier"],
        "n_neighbors": [5, 10, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto'],
    },
}

MLP_PARAMS = {
    "Classifier": {
        "model": ["MLPClassifier"],
        "hidden_layer_sizes": [(100,), (200,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
    },
}

# ---------------------------------------------------------------------------
# Cross every vectorizer grid with every classifier grid
# ---------------------------------------------------------------------------

VECTORIZATION_PARAMS = [
    PRETRAINED_QA_PARAMS,
    PRETRAINED_A_PARAMS,
    SKLEARN_QA_PARAMS,
    SKLEARN_A_PARAMS,
]
CLASSIFIER_PARAMS = [
    LOGISTIC_PARAMS,
    RANDOM_FOREST_PARAMS,
    SGD_PARAMS,
    SVM_PARAMS,
    KNN_PARAMS,
    MLP_PARAMS,
]

# Vectorizer grids vary slowest, classifier grids fastest; each merged dict
# carries one "Input" entry and one "Classifier" entry.
ALL_PARAM_COMBINATIONS = [
    {**input_grid, **classifier_grid}
    for input_grid in VECTORIZATION_PARAMS
    for classifier_grid in CLASSIFIER_PARAMS
]

# Convert each nested grid with the library helper before handing it to the search.
CV_SEARCH_PARAMS = [crossval.convert_nested_params(grid) for grid in ALL_PARAM_COMBINATIONS]

num_configurations = crossval.count_hyperparameters(CV_SEARCH_PARAMS)
print(f"Number of hyperparameter configurations: {num_configurations}")

Number of hyperparameter configurations: 504


> ## Model training:
> For each resampling run and each annotation code, cross-validate over the hyperparameter grid and keep the best model

In [9]:
# Number of resampling runs; the training loop draws a fresh train/test split for each run.
NUM_BOOTSTRAP_RUNS = 10

In [None]:
# Repeated train/test resampling.
# For every run: split the annotated data, then for every annotation code
# cross-validate + fit a model on the training split and predict on all rows
# (train, test and unannotated). Results are collected in:
#   bootstrap_dfs  - one combined DataFrame per run (with `<code>_pred` columns)
#   fitted_models  - fitted_models[run][code] -> fitted model object
#   bootstrap_data - all runs stacked row-wise
bootstrap_dfs = []
fitted_models = {}

# Loop-invariant: label the unannotated rows once, on a copy, so the
# `uncoded_df` loaded earlier is not mutated by this cell.
unannotated_df = uncoded_df.assign(split='unannotated')

for bootstrap_run in range(1, NUM_BOOTSTRAP_RUNS + 1):

    fitted_models[bootstrap_run] = {}

    # Seed the split with the run number: each run gets a different partition,
    # but re-running the notebook reproduces the same partitions.
    # NOTE(review): the randomized hyperparameter search below is not seeded
    # here, so the selected hyperparameters may still vary between executions.
    train_split, test_split = train_test_split(
        human_coded_df,
        test_size=0.25,
        random_state=bootstrap_run,
    )

    # `.assign` returns new frames, so no column is written into the objects
    # returned by the split (which can raise SettingWithCopyWarning).
    train_df = train_split.assign(split='train')
    test_df = test_split.assign(split='test')

    boot_df = pd.concat([train_df, test_df, unannotated_df])
    boot_df['bootstrap_run'] = bootstrap_run

    for code_var in code_variables:

        print(f"Bootstrap {bootstrap_run} | Annotation: {code_var}")

        ### Create X and y (training split only)
        X = train_df[[question_col, answer_col]]
        y = train_df[code_var]

        # Starting pipeline; the cross-validated search below is given the
        # full hyperparameter search space.
        iqual_model = nlpmodel.NLPModel()
        iqual_model.add_text_features(question_col, answer_col, model='TfidfVectorizer', env='scikit-learn')
        iqual_model.add_classifier(name="LogisticRegression")
        iqual_model.add_threshold()
        iqual_model.compile()
        cv_dict = iqual_model.cross_validate_fit(
            X, y,                                # X: DataFrame of text features, y: Series of labels
            search_parameters=CV_SEARCH_PARAMS,  # search_parameters: list of parameter grids to search over
            cv_method='RandomizedSearchCV',      # cv_method: options are GridSearchCV, RandomizedSearchCV
            n_iter=10,                           # n_iter: only used when cv_method='RandomizedSearchCV'
            scoring=scoring_dict,                # scoring: metric(s) evaluated during cross-validation
            refit='f1',                          # refit: metric used to refit the model
            n_jobs=-1,                           # n_jobs: number of parallel jobs
            cv_splits=3,                         # cv_splits: number of cross-validation splits
        )
        print()

        print(f"\n\nAverage F1 score for {code_var}: {cv_dict['avg_test_score']:.3f}", end='\n\n')

        # Predict with the same configured columns the model was trained on
        # (previously hard-coded as 'Q_en' / 'A_en').
        boot_df[code_var + "_pred"] = iqual_model.predict(boot_df[[question_col, answer_col]])
        fitted_models[bootstrap_run][code_var] = iqual_model

    # Append after each bootstrap run
    bootstrap_dfs.append(boot_df)

bootstrap_data = pd.concat(bootstrap_dfs, axis=0)

Bootstrap 1 | Annotation: religious
.......504 hyperparameters configurations possible.....


Average F1 score for religious: 0.593

Bootstrap 1 | Annotation: secular
.......504 hyperparameters configurations possible.....