# LASER + Logistic Regression (Monolingual-Train-Monolingual-Test)

In [156]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score, f1_score,confusion_matrix

## Utility Functions

In [117]:
# Ensure that you have the API running before running this.
def generate_LASER_embedding(sequence, lang='en', url="http://128.61.46.191:80/vectorize", timeout=60):
    '''
    Requests a LASER embedding for `sequence` from the vectorize HTTP service.

    Parameters:
        sequence: the text to embed; must not be None or empty.
        lang: language code sent to the service as the `lang` query parameter.
        url: endpoint of the vectorize service (defaults to the address this
            notebook has always used).
        timeout: seconds to wait for the service before giving up, so a dead
            server cannot hang the notebook indefinitely.

    Returns:
        A numpy array built from the "embedding" field of the JSON response.
        NOTE(review): callers index the result with [0], so the service
        presumably returns one vector per input sentence - confirm.

    Raises:
        ValueError: if `sequence` is None or empty.
        requests.HTTPError: if the service answers with an error status.
    '''
    # The previous guard (`len(sequence) < 0`) could never be true and, when the
    # `not sequence` half fired, only printed a message before calling the API anyway.
    if sequence is None or len(sequence) == 0:
        raise ValueError('Please pass in an input sequence to generate an embedding.')
    params = {"q": sequence, "lang": lang}
    resp = requests.get(url=url, params=params, timeout=timeout)
    # Surface HTTP failures directly instead of as an opaque JSON decoding error.
    resp.raise_for_status()
    return np.array(resp.json()["embedding"])

In [122]:
def generate_embeddings_for_dataset(ifname, lang):
    '''
    Generates embeddings and saves them into a CSV given an input file name and a language.

    Reads `{DATA_DIR}/{ifname}`, which must have a `text` column and an `hs`
    label column, requests one LASER embedding per row, and writes a CSV named
    `<ifname without extension>_embeddings.csv` into DATA_DIR. The output has
    one column per embedding component ('0', '1', ...) followed by an integer
    `hs` column.

    Parameters:
        ifname: name of the input CSV, relative to DATA_DIR.
        lang: language code forwarded to generate_LASER_embedding.

    Raises:
        ValueError: if the input file contains no rows.
    '''
    import os  # local import: only needed to strip the extension from `ifname`

    print(f'Generating input embeddings for {ifname}')
    df = pd.read_csv(f'{DATA_DIR}/{ifname}')
    total = df.shape[0]
    print(f'Total embeddings to generate: {total}')
    if total == 0:
        raise ValueError(f'{ifname} contains no rows; there is nothing to embed.')

    # Report progress about 25 times. Clamp the step to 1 so that inputs with
    # fewer than 25 rows do not trigger a modulo-by-zero.
    step = max(1, total // 25)
    rows = []
    for done, (text, label) in enumerate(zip(df['text'], df['hs']), start=1):
        vector = generate_LASER_embedding(text, lang=lang)[0]
        # Store the label as the last element so features and target share one row.
        rows.append(np.append(vector, [int(label)]))
        if done % step == 0:
            print(f'Completed {done} embeddings')
    embeddings = np.array(rows)

    # Save each entry of the embedding array as a column in the dataframe.
    # The width is taken from the data instead of being fixed at 1024; note that
    # train_and_test_LR_model still reads columns '0'..'1023'.
    columns = [str(i) for i in range(embeddings.shape[1] - 1)]
    columns.append('hs')
    embedding = pd.DataFrame(embeddings, columns=columns)
    embedding.hs = embedding.hs.astype(int)

    # splitext removes only the final extension, so names such as 'data.v2.csv'
    # keep their full stem.
    ofname = os.path.splitext(ifname)[0] + '_embeddings.csv'
    embedding.to_csv(f'{DATA_DIR}/{ofname}', index=False)
    print(f'Wrote embedding dataframe to {DATA_DIR}/{ofname}')

In [159]:
def train_and_test_LR_model(ifname, lang, c=0.01):
    '''
    Fits a logistic-regression classifier on a saved embeddings CSV and prints
    its accuracy and macro F1 on a held-out split.

    Parameters:
        ifname: name of the embeddings CSV inside DATA_DIR; it must contain the
            feature columns '0'..'1023' and the label column 'hs'.
        lang: label used only in the progress message.
        c: value passed to LogisticRegression as its `C` parameter.

    Returns:
        None; the metrics are printed.
    '''
    frame = pd.read_csv(f'{DATA_DIR}/{ifname}')
    print(f'Training model for {lang} on {ifname}')

    # Feature columns are named after their position in the embedding vector.
    feature_names = list(map(str, range(1024)))
    features = frame[feature_names]
    labels = frame['hs']

    # 85/15 split with a fixed seed so repeated runs give the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.15,
        random_state=42,
    )

    classifier = LogisticRegression(
        C=c,
        solver='lbfgs',
        class_weight='balanced',
        random_state=42,
    )
    classifier.fit(X_train, y_train)
    print('Training complete!')

    predictions = classifier.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    print(f'Accuracy: {acc}')
    macro_f1 = f1_score(y_test, predictions, average='macro')
    print(f'Macro F1: {macro_f1}')

In [123]:
# Relative path of the directory that holds the input CSVs and receives the
# generated embedding CSVs. The functions defined above read this global when
# they are called, so this cell has to run before any of them is invoked.
DATA_DIR = '../data/all-processed'

## French (Ousidhoum et al.)

In [124]:
# File name of the French dataset, resolved against DATA_DIR.
ifname = 'B_french_ousidhoum_processed.csv'
# NOTE(review): `df` is not used by any cell visible here -
# generate_embeddings_for_dataset re-reads the file itself. Confirm whether
# this load is still needed.
df = pd.read_csv(f'{DATA_DIR}/{ifname}')

In [125]:
# Requests one embedding per row from the LASER service (which must be
# reachable) and writes B_french_ousidhoum_processed_embeddings.csv into DATA_DIR.
generate_embeddings_for_dataset(ifname, 'fr')

Generating input embeddings for B_french_ousidhoum_processed.csv
Total embeddings to generate: 1028
Completed 41 embeddings
Completed 82 embeddings
Completed 123 embeddings
Completed 164 embeddings
Completed 205 embeddings
Completed 246 embeddings
Completed 287 embeddings
Completed 328 embeddings
Completed 369 embeddings
Completed 410 embeddings
Completed 451 embeddings
Completed 492 embeddings
Completed 533 embeddings
Completed 574 embeddings
Completed 615 embeddings
Completed 656 embeddings
Completed 697 embeddings
Completed 738 embeddings
Completed 779 embeddings
Completed 820 embeddings
Completed 861 embeddings
Completed 902 embeddings
Completed 943 embeddings
Completed 984 embeddings
Completed 1025 embeddings
Wrote embedding dataframe to ../data/all-processed/B_french_ousidhoum_processed_embeddings.csv


In [160]:
# Train and evaluate on the embeddings CSV written by
# generate_embeddings_for_dataset; 'French' only appears in the log message.
train_and_test_LR_model('B_french_ousidhoum_processed_embeddings.csv', 'French')

Training model for French on B_french_ousidhoum_processed_embeddings.csv
Training complete!
Accuracy: 0.6516129032258065
Macro F1: 0.5826685281212605
