# Model - Multiple Annotations

> ### Imports

In [1]:
import os
import pandas as pd
from iqual import iqualnlp, evaluation, crossval

> ### Load `annotated (human-coded)` and `unannotated` datasets

In [2]:
# Directory that holds the CSV inputs (path is relative to this notebook)
data_dir = "../../data"

# Human-coded (annotated) responses
annotated_csv  = os.path.join(data_dir, "annotated.csv")
human_coded_df = pd.read_csv(annotated_csv)

# Responses that have no human codes yet
unannotated_csv = os.path.join(data_dir, "unannotated.csv")
uncoded_df      = pd.read_csv(unannotated_csv)

> ### Split the data into training and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed: without `random_state` the shuffle differs on every run, so the
# train/test partition (and every number reported below) would not be
# reproducible after a kernel restart.
SPLIT_SEED = 42

# Hold out 25% of the human-coded rows for out-of-sample evaluation.
train_df, test_df = train_test_split(
    human_coded_df,
    test_size=0.25,
    random_state=SPLIT_SEED,
)
print(f"Train Size: {len(train_df)}\nTest Size: {len(test_df)}")

Train Size: 7470
Test Size: 2490


> ### Configure training data

In [4]:
### Text columns: the question column and the answer column
question_col, answer_col = 'Q_en', 'A_en'

### Annotation codes to model (one model is fitted per code further down)
code_variables = [
    'religious',
    'migration',
    'entrepreneur',
    'secular',
    'marriage',
]

In [5]:
# Scoring dictionary for evaluation: built by `iqual.evaluation` from the
# list of metric names requested below.
scoring_metric_names = ['f1']
scoring_dict = evaluation.get_scoring_dict(scoring_metric_names)

> ### Configure a Hyperparameter Grid for cross-validation + fitting

In [3]:
## Locations of the precomputed vector dictionaries created using `sentence-transformers`
dict_dir     = "../dictionaries"
sbert_models = ["all-mpnet-base-v2", "distiluse-base-multilingual-cased-v2"]

# Each model's vectors live in "<model name>.pkl" inside `dict_dir`.
sbert_pickle_names = [model_name + '.pkl' for model_name in sbert_models]
sbert_model_paths  = [os.path.join(dict_dir, pickle_name) for pickle_name in sbert_pickle_names]

In [14]:
def _sbert_vectorizer_grid():
    """Return a fresh search grid for a vectorizer backed by the saved SBERT dictionaries.

    A new dict (and a new `env` list) is created on every call so the question
    and answer entries never share mutable containers other than
    `sbert_model_paths` itself.
    """
    return {
        "model": sbert_model_paths,
        "env": ["saved-dictionary"],
    }


# Question + answer, both vectorized from the saved dictionaries.
SBERT_QA_PARAMS = {
    "Input": {
        "question": {"vectorizer": _sbert_vectorizer_grid()},
        "answer": {"vectorizer": _sbert_vectorizer_grid()},
    }
}

# Answer only: the question input is set to "drop".
SBERT_A_PARAMS = {
    "Input": {
        "question": "drop",
        "answer": {"vectorizer": _sbert_vectorizer_grid()},
    }
}

def _sklearn_vectorizer_grid(max_features):
    """Return a fresh search grid for a scikit-learn text vectorizer.

    `max_features` is the list of vocabulary-size candidates to search over;
    the vectorizer class candidates and the `env` tag are the same for every
    input.
    """
    return {
        "model": ['TfidfVectorizer', 'CountVectorizer'],
        "max_features": max_features,
        "env": ["scikit-learn"],
    }


# Question + answer, each with its own range of vocabulary sizes.
SKLEARN_QA_PARAMS = {
    "Input": {
        "question": {"vectorizer": _sklearn_vectorizer_grid([500, 1000, 1500, 2500])},
        "answer": {"vectorizer": _sklearn_vectorizer_grid([1500, 2500, 4000])},
    }
}

# Answer only: the question input is set to "drop".
SKLEARN_A_PARAMS = {
    "Input": {
        "question": "drop",
        "answer": {"vectorizer": _sklearn_vectorizer_grid([1500, 2500, 4000])},
    }
}

# Each grid lives under the "Classifier" key: "model" names the estimator
# class, the remaining entries list the candidate values to search over.

LOGISTIC_PARAMS = dict(
    Classifier=dict(
        model=["LogisticRegression"],
        C=[0.01, 0.1],
    ),
)

RANDOM_FOREST_PARAMS = dict(
    Classifier=dict(
        model=["RandomForestClassifier"],
        n_estimators=[100, 200],
        max_depth=[5, 10, 15],
    ),
)

# NOTE(review): recent scikit-learn releases renamed the SGDClassifier loss
# "log" to "log_loss" — confirm against the installed version before re-running.
SGD_PARAMS = dict(
    Classifier=dict(
        model=["SGDClassifier"],
        loss=["hinge", "log"],
        alpha=[0.0001, 0.001],
    ),
)

### Pair every vectorizer grid with every classifier grid
VECTORIZATION_PARAMS = [
    SKLEARN_QA_PARAMS,
    SKLEARN_A_PARAMS,
    SBERT_QA_PARAMS,
    SBERT_A_PARAMS,
]
CLASSIFIER_PARAMS = [
    LOGISTIC_PARAMS,
    RANDOM_FOREST_PARAMS,
    SGD_PARAMS,
]

# One merged dict per (vectorizer, classifier) pair; the vectorizer grid
# contributes the "Input" key and the classifier grid the "Classifier" key.
params_all = [
    dict(vect_grid, **clf_grid)
    for vect_grid in VECTORIZATION_PARAMS
    for clf_grid in CLASSIFIER_PARAMS
]

# Convert each nested grid with `crossval.convert_nested_params` for the search.
CV_SEARCH_PARAMS = list(map(crossval.convert_nested_params, params_all))

> ## Model training:
> Cross-validate over hyperparameters and select the best model

In [21]:
# Search settings are the same for every code, so they are assembled once and
# forwarded unchanged to `cross_validate_fit` inside the loop.
cv_settings = dict(
    search_parameters=CV_SEARCH_PARAMS,  # search_parameters: parameter grids to use for cross-validation
    cv_method='RandomizedSearchCV',      # cv_method: options: GridSearchCV, RandomizedSearchCV
    n_iter=10,                           # n_iter: only used when cv_method='RandomizedSearchCV'
    scoring=scoring_dict,                # scoring: scoring metric(s) to use for cross-validation
    refit='f1',                          # refit: metric to use for refitting the model
    n_jobs=-1,                           # n_jobs: number of parallel threads to use
    cv_splits=3,                         # cv_splits: number of cross-validation splits
)

# One fitted model per annotation code, keyed by the code's column name.
fitted_models = {}
for code_var in code_variables:
    print(code_var)

    ### Features (question + answer text) and labels for this code
    X = train_df[[question_col,answer_col]]
    y = train_df[code_var]

    # Starting pipeline; the searched parameters come from `cv_settings`.
    iqual_model = iqualnlp.Model()
    iqual_model.add_text_features(question_col,answer_col,model='TfidfVectorizer',env='scikit-learn')
    iqual_model.add_classifier(name="LogisticRegression")
    iqual_model.add_threshold()
    iqual_model.compile()

    cv_dict = iqual_model.cross_validate_fit(X, y, **cv_settings)

    # Two blank lines, then the score stored under 'avg_test_score'.
    print(end="\n\n")
    print("Average F1 score for {code_var}: {score:.3f}".format(code_var=code_var,score=cv_dict['avg_test_score']))

    # Keep the fitted model so later cells can evaluate and predict with it
    fitted_models[code_var] = iqual_model

religious
.......720 hyperparameters configurations possible.....
Average F1 score for religious: 0.628
migration
.......720 hyperparameters configurations possible.....
Average F1 score for migration: 0.653
entrepreneur
.......720 hyperparameters configurations possible.....
Average F1 score for entrepreneur: 0.680
secular
.......720 hyperparameters configurations possible.....
Average F1 score for secular: 0.498
marriage
.......720 hyperparameters configurations possible.....
Average F1 score for marriage: 0.810


### Evaluate model using out-of-sample data (held-out human-coded data)

In [23]:
# Score every fitted model on the held-out human-coded rows.
# The feature columns are taken from `question_col` / `answer_col` (the same
# variables used to build the training features) rather than hard-coded names,
# so evaluation cannot drift out of sync with training if the configuration
# cell is changed.
for code_var in code_variables:
    test_pred = fitted_models[code_var].predict(test_df[[question_col, answer_col]])
    test_act  = test_df[code_var].tolist()
    f1_score  = evaluation.calc_f1_score_from_labels(test_pred,test_act)
    print(f"Out-sample F1-score for {code_var} is : {f1_score:.3f}")

Out-sample F1-score for religious is : 0.651
Out-sample F1-score for migration is : 0.667
Out-sample F1-score for entrepreneur is : 0.619
Out-sample F1-score for secular is : 0.440
Out-sample F1-score for marriage is : 0.830


### Predict labels for unannotated data

In [30]:
# Upper bound on how many example Q/A pairs to print per code.
N_EXAMPLES = 3

for code_var in code_variables:
    pred_col = code_var + '_pred'

    # Predict with the same feature columns the model was trained on
    # (`question_col` / `answer_col`) instead of hard-coded column names.
    uncoded_df[pred_col] = fitted_models[code_var].predict(uncoded_df[[question_col, answer_col]])

    print(f"\tExamples of positive {code_var} predictions:\n")
    print('\t===============================================\n\n')

    # Rows predicted positive for this code.
    positives = uncoded_df.loc[uncoded_df[pred_col] == 1, [question_col, answer_col]]

    # `DataFrame.sample(n)` raises ValueError when n exceeds the number of rows,
    # so cap the sample size; a code with no positive predictions prints nothing.
    n_show = min(N_EXAMPLES, len(positives))
    for _idx, row in positives.sample(n_show).iterrows():
        print("Q: ",row[question_col],"\n","A: ", row[answer_col],sep='')
        print()

	Examples of positive religious predictions:



Q: Bro, what kind of work do you think Nur Arafat will do when he grows up?
A: Maulana will grow up to serve religion.

Q: If God blesses you, who will be your sons when they grow up?
A: My son Yahya will be the Mufti of the Fatwa Department, a researcher of the Big Book (Hadith Sharif). Hafez will be in the Koran.

Q: So what did you say, brother?
A: The knowledge of the world is called the world. There is a knowledge of the hereafter. If my son can teach people about the hereafter in madrasas, mosques. I like that.

	Examples of positive migration predictions:



Q: What other dreams do you have for him after making him master, tell me a little?
A: Apart from that, his necessary travel documents, clothes, etc. are all these.

Q: Yes, how to do that?
A: I will send visa abroad with a relative.

Q: What to send abroad for?
A: For the good of the nation.

	Examples of positive entrepreneur predictions:



Q: What do you dream about your ch

In [36]:
# Show the best parameter set found by the cross-validation search for each code.
for code_var in code_variables:
    model_for_code = fitted_models[code_var]
    print(
        f"\tBest parameters for {code_var}:\n\n",
        model_for_code.cv.get_best_params(),
        end='\n\n',
    )
    

	Best parameters for religious:

 {'Input__question__vectorizer__model': 'TfidfVectorizer', 'Input__question__vectorizer__max_features': 500, 'Input__question__vectorizer__env': 'scikit-learn', 'Input__answer__vectorizer__model': 'TfidfVectorizer', 'Input__answer__vectorizer__max_features': 4000, 'Input__answer__vectorizer__env': 'scikit-learn', 'Classifier__n_estimators': 100, 'Classifier__model': 'RandomForestClassifier', 'Classifier__max_depth': 10}

	Best parameters for migration:

 {'Input__answer__vectorizer__model': '../dictionaries\\distiluse-base-multilingual-cased-v2.pkl', 'Input__answer__vectorizer__env': 'saved-dictionary', 'Classifier__model': 'SGDClassifier', 'Classifier__loss': 'hinge', 'Classifier__alpha': 0.0001}

	Best parameters for entrepreneur:

 {'Input__question__vectorizer__model': 'CountVectorizer', 'Input__question__vectorizer__max_features': 2500, 'Input__question__vectorizer__env': 'scikit-learn', 'Input__answer__vectorizer__model': 'CountVectorizer', 'Input