In [1]:
import os
import sys
import numpy as np
import pandas as pd

# Location of the local isquash checkout. Set the ISQUASH_DIR environment
# variable to point at a checkout on another machine; the original path is
# kept as the fallback so existing setups keep working.
git_dir = os.environ.get("ISQUASH_DIR", r"C:/Users/Aditya/GitHub/isquash")

# Make both the repository root and the package directory importable.
# Skip entries that are already present so re-running this cell does not
# keep growing sys.path.
for _path in (git_dir, os.path.join(git_dir, 'isquash')):
    if _path not in sys.path:
        sys.path.append(_path)

# Show up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)

Directory changed to C:/Users/Aditya/GitHub/isquash


In [2]:
from isquash import nlpmodel, preprocessing, evaluation,crossval

### Load `annotated (human-coded)` and `unannotated` datasets

In [130]:
# Human-annotated rows (split into train/test below) and the rows that
# still need labels. Paths are relative to the current working directory.
human_coded_df = pd.read_csv("data/annotated.csv")
uncoded_df = pd.read_csv("data/unannotated.csv")

In [132]:
from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split (and every score computed from it) is
# the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 25% of the human-coded rows for out-of-sample evaluation.
train_df, test_df = train_test_split(
    human_coded_df,
    test_size=0.25,
    random_state=SPLIT_SEED,
)
print(f"Train Size: {len(train_df)}\nTest Size: {len(test_df)}")

Train Size: 7470
Test Size: 2490


In [89]:
# Names of the question and answer text columns used as model inputs.
question_col = 'Q_en'
answer_col = 'A_en'

In [138]:
### Select a code
code_variable = 'marriage'

# Inputs are the two text columns of the training split; the target is the
# column named after the selected code.
text_columns = [question_col, answer_col]
X = train_df[text_columns]
y = train_df[code_variable]

### Initialize the model

In [139]:
# Create the model object. The add_* calls below are applied to it in order,
# and compile() is called last to finish the setup.
classifier = nlpmodel.NLPModel()

# Register the question and answer text columns and choose the feature-extraction model.
# Author's note on available options: scikit-learn, spacy, sentence-transformers,
# or precomputed (a pickled dictionary).
classifier.add_text_features(question_col,answer_col,model='TfidfVectorizer')

# Choose the primary classifier model.
classifier.add_estimation(name = "LogisticRegression")

# Add a threshold layer (called with its defaults here).
# NOTE(review): presumably converts predicted scores into labels — confirm in nlpmodel.
classifier.add_threshold()

# Call `compile()` to finish the setup.
classifier.compile()

In [114]:
# Metrics to score during cross-validation; 'f1' is also the refit metric
# passed to cross_validate_fit later in the notebook.
metric_names = ['f1']
scoring_dict = evaluation.get_scoring_dict(metric_names)

### Configure a Hyperparameter Grid for cross-validation + fitting

In [140]:
def _tfidf_vectorizer_grid():
    """Return a fresh copy of the TF-IDF vectorizer grid shared by both text inputs."""
    return {
        "model": ["TfidfVectorizer"],
        "env": ["scikit-learn-vectorizer"],
        "max_features": [1000, 2000],
        "ngram_range": [(1, 2)],
    }


# Nested hyperparameter grid: one vectorizer grid per text input, plus the
# estimator grid. Every leaf is a list of candidate values.
search_param_config = {
    "Input": {
        # Each input gets its own dict object so the two grids stay independent.
        field: {"vectorizer": _tfidf_vectorizer_grid()}
        for field in ("question", "answer")
    },
    "Estimation": {
        "model": ["LogisticRegression"],
        "C": [0.001, 0.01, 0.1],
    },
}

# Convert the nested grid into the object passed as `search_parameters`
# to classifier.cross_validate_fit below.
# NOTE(review): presumably flattens the nested keys into parameter names — confirm in crossval.
CV_SEARCH_PARAMS = crossval.convert_nested_params(search_param_config)

## Model training:
> Cross-validate over hyperparameters and select the best model

In [141]:
# Search the hyperparameter grid on the training data with 3 CV splits,
# scoring with `scoring_dict` and passing 'f1' as the refit metric.
cv_dict = classifier.cross_validate_fit(
    X, y,
    cv_method='GridSearchCV',
    search_parameters=CV_SEARCH_PARAMS,
    cv_splits=3,
    scoring=scoring_dict,
    refit='f1',
    # n_jobs: number of parallel threads to use (author's note).
    # NOTE(review): -1 presumably means "use all available" — confirm in nlpmodel.
    n_jobs=-1,
)


Average CV score: 0.758


In [146]:
# Report the mean cross-validated test score returned by cross_validate_fit.
avg_f1 = cv_dict['avg_test_score']
print("Average F1 score: {:.3f}".format(avg_f1))

Average F1 score: 0.758


### Evaluate the model on out-of-sample data (held-out human-coded data)

In [150]:
# Predict on the held-out split. Use question_col / answer_col (the same
# variables used to build the training inputs) instead of repeating the
# literal column names, so a change to the column names applies everywhere.
test_pred = classifier.predict(test_df[[question_col, answer_col]])

# Human-assigned labels for the same held-out rows, as a plain list.
test_act = test_df[code_variable].tolist()

In [151]:
# Score the held-out predictions against the human labels.
# NOTE(review): arguments are passed as (predicted, actual) — confirm this
# matches the order calc_f1_score_from_labels expects.
f1_score = evaluation.calc_f1_score_from_labels(test_pred, test_act)
print(f"Out-sample F1-score: {f1_score:.3f}")

Out-sample F1-score: 0.788


### Predict labels for unannotated data

In [152]:
# Label the unannotated rows with the fitted model and store the result in
# a '<code>_pred' column. Use question_col / answer_col (as for the training
# inputs) rather than repeating the literal column names.
uncoded_df[code_variable + '_pred'] = classifier.predict(
    uncoded_df[[question_col, answer_col]]
)

In [153]:
# Count how many unannotated rows received each predicted label.
pred_counts = uncoded_df[code_variable + "_pred"].value_counts()
dict(pred_counts)

{0: 21613, 1: 890}

In [161]:
# Show a few question/answer pairs that were predicted positive for the
# selected code, as a quick sanity check.
positive_rows = uncoded_df.loc[
    uncoded_df[code_variable + "_pred"] == 1,
    [question_col, answer_col],
]

# Cap the sample size so the cell still runs when fewer than 5 rows are
# predicted positive; the fixed seed keeps the examples stable across reruns.
n_examples = min(5, len(positive_rows))
for _, row in positive_rows.sample(n_examples, random_state=42).iterrows():
    print("Q: ", row[question_col], "\n", "A: ", row[answer_col], sep='')
    print()

Q: Well What will you do?
A: Until she gets married, she will be busy with her studies

Q: What dreams do you have especially about the eldest child?
A: I will get married after studying.

Q: Gee, what hope do you have for them?
A: The one who is studying will finish his studies well and become a shopkeeper or something and the rest will grow up and get married. I will marry even if it is difficult.

Q: Parents have many hopes for their eldest child. What other hopes do you have for him?
A: I have no hope but to marry her.

Q: Do you have any wish or hope that the sewing machine will work?
A: If I get a good relationship, I will give this much hope for marriage.

