In [None]:
import matplotlib
import matplotlib.pyplot as plt
import smart_open
smart_open.open = smart_open.smart_open
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from random import shuffle
from collections import defaultdict
from itertools import chain

In [None]:
# Definitional word pairs: each spreadsheet row pairs a "Profan" word with its "Neutral" counterpart.
df_definitional_pairs_100 = pd.read_excel("../data/100_wordpairs.xlsx")
definitional_pairs_100 = list(
    zip(
        df_definitional_pairs_100["Profan"].tolist(),
        df_definitional_pairs_100["Neutral"].tolist(),
    )
)

# Evaluation vocabularies, one spreadsheet column per word category.
test_words = pd.read_excel("../data/test_words.xlsx")
human_nouns = test_words["nouns_human"].tolist()
general_nouns = test_words["nouns_general"].tolist()
verbs_adjs = test_words["verbs_adjectives"].tolist()

# Gold labels shared by all three test vocabularies.
# NOTE(review): assumes every column lists 25 "OFF" words followed by 25 "OTH" words -- confirm against the spreadsheet.
test_labels = ["OFF"] * 25 + ["OTH"] * 25

# Word embeddings, read with gensim's word2vec-format loader.
model = KeyedVectors.load_word2vec_format('../data/embed_tweets_de_300D_fasttext')

In [1]:
def PCA_LDA(pairs, embedding, test_words, test_labels, normalize=False, num_components = 10):
    """Train a PCA -> LDA classifier on word pairs and score it on test words.

    Every pair contributes two training vectors: the first word is labelled
    "OFF" and the second "OTH". The training vectors are reduced to
    ``num_components`` principal components, an LDA classifier is fitted in
    that space, and ``test_words`` are classified after being projected with
    the same fitted PCA.

    Parameters
    ----------
    pairs : iterable of (str, str)
        (OFF word, OTH word) tuples; both words are looked up in ``embedding``.
    embedding : mapping-like
        Object indexable by word that returns the word's vector.
    test_words : iterable of str
        Words to classify; each is looked up in ``embedding``.
    test_labels : sequence of str
        Gold labels aligned with ``test_words``.
    normalize : bool, default False
        If True, subtract each pair's midpoint from both of its vectors
        before PCA. Test vectors are never centred per pair.
    num_components : int, default 10
        Number of principal components kept by the PCA step.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions against ``test_labels``.
    """
    matrix = []
    word_labels = []
    for a, b in pairs:
        vec_a, vec_b = embedding[a], embedding[b]
        if normalize:
            # Centre the pair on its midpoint so only the within-pair difference remains.
            center = (vec_a + vec_b) / 2
            vec_a, vec_b = vec_a - center, vec_b - center
        matrix.extend((vec_a, vec_b))
        word_labels.extend(("OFF", "OTH"))
    matrix = np.array(matrix)

    pca = PCA(n_components=num_components)
    pca_results = pca.fit_transform(matrix)
    lda = LDA(n_components=1)
    lda.fit(pca_results, word_labels)

    # Test words go through the PCA fitted on the training pairs before classification.
    test_words_embeddings = np.array([embedding[w] for w in test_words])
    pca_results_test = pca.transform(test_words_embeddings)
    predictions = lda.predict(pca_results_test)

    return f1_score(y_true=test_labels, y_pred=predictions, average='macro')

In [2]:
def doLDA(pairs, embedding, test_words, test_labels):
    """Train an LDA classifier directly on word-pair embeddings and score it.

    Every pair contributes two training vectors: the first word is labelled
    "OFF" and the second "OTH". No dimensionality reduction is applied; the
    LDA is fitted on the raw embedding vectors and then used to classify
    ``test_words``.

    Parameters
    ----------
    pairs : iterable of (str, str)
        (OFF word, OTH word) tuples; both words are looked up in ``embedding``.
    embedding : mapping-like
        Object indexable by word that returns the word's vector.
    test_words : iterable of str
        Words to classify; each is looked up in ``embedding``.
    test_labels : sequence of str
        Gold labels aligned with ``test_words``.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions against ``test_labels``.
    """
    matrix = []
    word_labels = []
    for a, b in pairs:
        matrix.extend((embedding[a], embedding[b]))
        word_labels.extend(("OFF", "OTH"))
    matrix = np.array(matrix)

    lda = LDA(n_components=1)
    lda.fit(matrix, word_labels)

    test_words_embeddings = np.array([embedding[w] for w in test_words])
    predictions = lda.predict(test_words_embeddings)

    return f1_score(y_true=test_labels, y_pred=predictions, average='macro')

# Word Classification Experiment: Varying the Number of Training Word Pairs

In [None]:
#based on PCA-LDA: Find best number of PCs
# For every training-set size, choose the number of principal components that
# maximises (best macro-F1 over the resamples) - (standard deviation over the resamples).
best_pcs = {}
for step_size in range(10,101,10): #increase number of word pairs by 10
    pc_results = defaultdict(list)
    for i in range(10): #10 random resamples
        # Shuffle a copy so the shared pair list is not reordered as a side effect.
        shuffled_pairs = list(definitional_pairs_100)
        shuffle(shuffled_pairs)
        # Hold out the last 10 pairs for validation and draw the training sample
        # only from the remaining pairs, so the two sets can never overlap
        # (the sample is capped at the size of the remaining pool).
        held_out_pairs = shuffled_pairs[-10:]
        sample = shuffled_pairs[:-10][:step_size]
        sample_test = list(chain(*held_out_pairs))
        sample_test_labels = ["OFF", "OTH"] * len(held_out_pairs)

        for pc in range(1, len(sample)*2): #for each possible number of PCs
            f1 = PCA_LDA(sample, model, sample_test, sample_test_labels, normalize=False,num_components=pc)
            pc_results[pc].append(f1)
    # Keyed lookup instead of list position; on ties the smallest PC count wins
    # because pc_results keeps its keys in ascending insertion order.
    best_pcs[step_size] = max(
        pc_results,
        key=lambda pc: max(pc_results[pc]) - np.std(pc_results[pc]),
    )
print(best_pcs)

In [None]:
# Evaluate PCA-LDA on the three test vocabularies for every training-set size,
# using the number of PCs stored in best_pcs for that size. Each reported value
# is the mean macro-F1 over 10 random resamples of the training pairs.
pca_lda_test_sets = {"human": human_nouns, "noun": general_nouns, "verb": verbs_adjs}
pca_lda_mean_f1 = {name: [] for name in pca_lda_test_sets}
for step_size in range(10, 101, 10):  # grow the training set by 10 pairs per step
    fold_f1 = {name: [] for name in pca_lda_test_sets}
    for fold in range(10):
        shuffle(definitional_pairs_100)
        sample = definitional_pairs_100[:step_size + 10]
        # The same training sample is scored against every test vocabulary.
        for name, words in pca_lda_test_sets.items():
            fold_f1[name].append(
                PCA_LDA(sample, model, words, test_labels, normalize=False, num_components=best_pcs[step_size])
            )
    for name, scores in fold_f1.items():
        pca_lda_mean_f1[name].append(np.mean(scores))
results_human = pca_lda_mean_f1["human"]
results_noun = pca_lda_mean_f1["noun"]
results_verb = pca_lda_mean_f1["verb"]

In [None]:
#based on PCA-LDA-NORM
# For every training-set size, choose the number of principal components that
# maximises (best macro-F1 over the resamples) - (standard deviation over the resamples),
# with each training pair centred on its midpoint (normalize=True).
best_pcs = {}
for step_size in range(10,101,10): #increase number of word pairs by 10
    pc_results = defaultdict(list)
    for i in range(10): #10 random resamples
        # Shuffle a copy so the shared pair list is not reordered as a side effect.
        shuffled_pairs = list(definitional_pairs_100)
        shuffle(shuffled_pairs)
        # Hold out the last 10 pairs for validation and draw the training sample
        # only from the remaining pairs, so the two sets can never overlap
        # (the sample is capped at the size of the remaining pool).
        held_out_pairs = shuffled_pairs[-10:]
        sample = shuffled_pairs[:-10][:step_size]
        sample_test = list(chain(*held_out_pairs))
        sample_test_labels = ["OFF", "OTH"] * len(held_out_pairs)
        for pc in range(1, len(sample)*2): #for each possible number of PCs
            f1 = PCA_LDA(sample, model, sample_test, sample_test_labels, normalize=True,num_components=pc)
            pc_results[pc].append(f1)
    # Keyed lookup instead of list position; on ties the smallest PC count wins
    # because pc_results keeps its keys in ascending insertion order.
    best_pcs[step_size] = max(
        pc_results,
        key=lambda pc: max(pc_results[pc]) - np.std(pc_results[pc]),
    )
print(best_pcs)

# Evaluate PCA-LDA-NORM on the three test vocabularies for every training-set
# size. Each reported value is the mean macro-F1 over 10 random resamples.
results_human_norm, results_noun_norm, results_verb_norm = [],[],[]
for step_size in range(10,101,10): #increase number of word pairs by 10
    kfold_results_human, kfold_results_noun, kfold_results_verb = [],[],[]
    for i in range(10): #10 random resamples
        shuffle(definitional_pairs_100)
        sample = definitional_pairs_100[:step_size+10]
        # normalize=True: the pair-centred variant, matching the setting under
        # which best_pcs was selected for this experiment.
        f1_human = PCA_LDA(sample, model, human_nouns, test_labels, normalize=True,num_components=best_pcs[step_size])
        kfold_results_human.append(f1_human)
        f1_noun = PCA_LDA(sample, model, general_nouns, test_labels, normalize=True,num_components=best_pcs[step_size])
        kfold_results_noun.append(f1_noun)
        f1_verb = PCA_LDA(sample, model, verbs_adjs, test_labels, normalize=True,num_components=best_pcs[step_size])
        kfold_results_verb.append(f1_verb)
    results_human_norm.append(np.mean(kfold_results_human))
    results_noun_norm.append(np.mean(kfold_results_noun))
    results_verb_norm.append(np.mean(kfold_results_verb))

In [None]:
#based on LDA
# Evaluate plain LDA (no PCA step) on the three test vocabularies for every
# training-set size. Each reported value is the mean macro-F1 over 10 random
# resamples of the training pairs.
lda_test_sets = {"human": human_nouns, "noun": general_nouns, "verb": verbs_adjs}
lda_mean_f1 = {name: [] for name in lda_test_sets}
for step_size in range(10, 101, 10):  # grow the training set by 10 pairs per step
    fold_f1 = {name: [] for name in lda_test_sets}
    for fold in range(10):
        shuffle(definitional_pairs_100)
        sample = definitional_pairs_100[:step_size + 10]
        # The same training sample is scored against every test vocabulary.
        for name, words in lda_test_sets.items():
            fold_f1[name].append(doLDA(sample, model, words, test_labels))
    for name, scores in fold_f1.items():
        lda_mean_f1[name].append(np.mean(scores))
results_human_LDA = lda_mean_f1["human"]
results_noun_LDA = lda_mean_f1["noun"]
results_verb_LDA = lda_mean_f1["verb"]