# Model - Multiple Vectorizers

> ### Imports

In [1]:
import os
import pandas as pd
from iqual import iqualnlp, evaluation, crossval, vectorizers

> ### Load `annotated (human-coded)` and `unannotated` datasets

In [2]:
# Locations of the annotated (human-coded) and unannotated CSV files.
data_dir = "../../data"
annotated_path = os.path.join(data_dir, "annotated.csv")
unannotated_path = os.path.join(data_dir, "unannotated.csv")

# Read both datasets into DataFrames.
human_coded_df = pd.read_csv(annotated_path)
uncoded_df = pd.read_csv(unannotated_path)

> ### Locate the precomputed `spacy` and `sentence-transformers` vector dictionaries

In [3]:
# Directory containing the pickled (`.pkl`) vector dictionaries; model paths are built relative to it.
dict_dir        = "../dictionaries"

In [4]:
# Names of the embedding models; each one maps to a `<name>.pkl` file under `dict_dir`.
sentence_transformer_models = [
    "all-mpnet-base-v2",
    "all-roberta-large-v1",
    "distiluse-base-multilingual-cased-v2",
]
spacy_models = ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"]

# One dictionary path per model: sentence-transformers first, then spacy.
model_paths = [
    os.path.join(dict_dir, f"{name}.pkl")
    for name in sentence_transformer_models + spacy_models
]

> ### Split the data into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the human-coded rows for out-of-sample evaluation.
# A fixed random_state keeps the split (and therefore every downstream score)
# reproducible across "Restart Kernel -> Run All".
train_df, test_df = train_test_split(human_coded_df, test_size=0.25, random_state=42)
print(f"Train Size: {len(train_df)}\nTest Size: {len(test_df)}")

Train Size: 7470
Test Size: 2490


> ### Configure training data

In [6]:
# Text columns: the question and its answer
question_col, answer_col = 'Q_en', 'A_en'

# Annotation code (label column) to model
code_variable = 'marriage'

# Features (X) and labels (y), both taken from the training split
X = train_df[[question_col, answer_col]]
y = train_df[code_variable]

### Initialize the model

In [7]:
# Step 1: Initiate the model class
iqual_model = iqualnlp.Model()

# Step 2: Add the text-feature layer
#  Pass the text columns and choose a feature extraction model via `env`
#  (Available options: scikit-learn, spacy, sentence-transformers, saved-dictionary (pickled dictionary)).
#  The dictionary path is built from `dict_dir` instead of being hard-coded,
#  so it always follows the configured dictionary directory.
iqual_model.add_text_features(
    question_col,
    answer_col,
    model=os.path.join(dict_dir, 'all-mpnet-base-v2.pkl'),
    env='saved-dictionary',
)

# Step 3: Add a feature transforming layer (optional)
# A. Choose a feature-scaler. Available options:
#    any scikit-learn scaler from `sklearn.preprocessing`
### iqual_model.add_feature_transformer(name='StandardScaler', transformation="FeatureScaler")
# OR
# B. Choose a dimensionality reduction model. Available options:
#    - Any scikit-learn dimensionality reduction model from `sklearn.decomposition`
#    - Uniform Manifold Approximation and Projection (UMAP) using umap.UMAP (https://umap-learn.readthedocs.io/en/latest/)
### iqual_model.add_feature_transformer(name='PCA', transformation="DimensionalityReduction")

# Step 4: Add a classifier layer
# Choose a primary classifier model (Available options: any scikit-learn classifier)
iqual_model.add_classifier(name="LogisticRegression")

# Step 5: Add a threshold layer. This is optional, but recommended for binary classification
iqual_model.add_threshold()

# Step 6: Compile the model
iqual_model.compile()

In [8]:
# Scoring dictionary for evaluation: request only the F1 metric
metric_names = ['f1']
scoring_dict = evaluation.get_scoring_dict(metric_names)

> ### Configure a Hyperparameter Grid for cross-validation + fitting

In [9]:
# Rebuild the dictionary paths used by the search grid: spacy models first, then sentence-transformers.
# NOTE(review): this overwrites the `model_paths` defined earlier, which listed the same files in the opposite group order.
model_paths = [
    os.path.join(dict_dir, f"{name}.pkl")
    for name in spacy_models + sentence_transformer_models
]

In [10]:
# Grid 1: saved dictionaries (vectors precomputed with spacy / sentence-transformers).
# The question and answer inputs each get their own entry drawn from `model_paths`.
params_saved = {
    "Input": {
        "question": {
            "vectorizer": {
                "model": model_paths,
                "env": ["saved-dictionary"],
            },
        },
        "answer": {
            "vectorizer": {
                "model": model_paths,
                "env": ["saved-dictionary"],
            },
        },
    },
    "Classifier": {
        "model": ["LogisticRegression"],
        "C": [0.01, 0.1],
    },
}

# Grid 2: scikit-learn bag-of-words vectors (TfidfVectorizer / CountVectorizer),
# with separate `max_features` candidates for the question and answer inputs.
params_sklearn = {
    "Input": {
        "question": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "max_features": [500, 1000, 1500, 2500],
                "env": ["scikit-learn"],
            },
        },
        "answer": {
            "vectorizer": {
                "model": ['TfidfVectorizer', 'CountVectorizer'],
                "max_features": [1500, 2500, 4000],
                "env": ["scikit-learn"],
            },
        },
    },
    "Classifier": {
        "model": ["LogisticRegression"],
        "C": [0.01, 0.1],
    },
}

# Convert each nested grid with `crossval.convert_nested_params`; the search receives one converted grid per vectorizer family.
CV_SEARCH_PARAMS = [
    crossval.convert_nested_params(grid)
    for grid in (params_saved, params_sklearn)
]

In [11]:
# Display the converted search grids (one dictionary per vectorizer family) for inspection.
CV_SEARCH_PARAMS

[{'Input__question__vectorizer__model': ['../dictionaries\\en_core_web_sm.pkl',
   '../dictionaries\\en_core_web_md.pkl',
   '../dictionaries\\en_core_web_lg.pkl',
   '../dictionaries\\all-mpnet-base-v2.pkl',
   '../dictionaries\\all-roberta-large-v1.pkl',
   '../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'],
  'Input__question__vectorizer__env': ['saved-dictionary'],
  'Input__answer__vectorizer__model': ['../dictionaries\\en_core_web_sm.pkl',
   '../dictionaries\\en_core_web_md.pkl',
   '../dictionaries\\en_core_web_lg.pkl',
   '../dictionaries\\all-mpnet-base-v2.pkl',
   '../dictionaries\\all-roberta-large-v1.pkl',
   '../dictionaries\\distiluse-base-multilingual-cased-v2.pkl'],
  'Input__answer__vectorizer__env': ['saved-dictionary'],
  'Classifier__model': ['LogisticRegression'],
  'Classifier__C': [0.01, 0.1]},
 {'Input__question__vectorizer__model': ['TfidfVectorizer', 'CountVectorizer'],
  'Input__question__vectorizer__max_features': [500, 1000, 1500, 2500],
  'Input

> ## Model training:
> Cross-validate over hyperparameters and select the best model

In [12]:
# Search over the combined grids with cross-validation and fit the model on (X, y).
cv_dict = iqual_model.cross_validate_fit(
    X,                                   # pandas DataFrame with the question/answer text columns
    y,                                   # pandas Series of labels
    search_parameters=CV_SEARCH_PARAMS,  # list of converted parameter grids to search over
    cv_method='RandomizedSearchCV',      # cross-validation method; options: GridSearchCV, RandomizedSearchCV
    n_iter=30,                           # presumably the number of configurations sampled by the randomized search
    cv_splits=3,                         # number of cross-validation splits
    scoring=scoring_dict,                # scoring metric(s) used during cross-validation
    refit='f1',                          # metric used for refitting the model
    n_jobs=-1,                           # parallelism; -1 presumably means all available cores (scikit-learn convention)
)

.......168 hyperparameters configurations possible.....

In [13]:
# Report the average test score stored in the cross-validation results, to three decimals.
print(f"Average F1 score: {cv_dict['avg_test_score']:.3f}")

Average F1 score: 0.813


### Evaluate the model on out-of-sample data (held-out human-coded data)

In [14]:
# Predict on the held-out split using the configured text columns
# (`question_col` / `answer_col`) so evaluation always uses the same columns as training.
test_pred = iqual_model.predict(test_df[[question_col, answer_col]])
test_act  = test_df[code_variable].tolist()

# Compare predicted labels with the human-coded labels.
f1_score = evaluation.calc_f1_score_from_labels(test_pred, test_act)
print(f"Out-sample F1-score: {f1_score:.3f}")

Out-sample F1-score: 0.824


### Predict labels for unannotated data

In [15]:
# Store predictions for the unannotated data in a `<code>_pred` column.
# The text columns come from `question_col` / `answer_col`, so they stay in sync with training.
uncoded_df[code_variable + '_pred'] = iqual_model.predict(uncoded_df[[question_col, answer_col]])

In [16]:
# Number of unannotated rows per predicted label.
pred_counts = uncoded_df[f"{code_variable}_pred"].value_counts()
dict(pred_counts)

{0: 21515, 1: 988}

In [20]:
# Print up to three question/answer pairs that the model labelled positive (prediction == 1).
positive_rows = uncoded_df.loc[
    uncoded_df[f"{code_variable}_pred"] == 1,
    [question_col, answer_col],
]

# Cap the sample size so the cell does not fail when fewer than three rows are positive;
# random_state pins the sample so the displayed examples are the same on every re-run.
examples = positive_rows.sample(min(3, len(positive_rows)), random_state=42)

for question, answer in examples.itertuples(index=False, name=None):
    print(f"Q: {question}\nA: {answer}")
    print()

Q: What else do you expect from him after becoming a policeman?
A: He hopes to marry her after that.

Q: Will you marry the girl or get a job?
A: If a girl wants to study and get a job, she will get a job, if she wants to get married, she will get married.

Q: What dreams do you have about your children?
A: I like to make my daughter study more. I want to marry my daughter if she passes her degree and gets a good boy.

