In [1]:
from numpy import genfromtxt
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
# Input artefacts: the sentence table and the pre-computed sentence embeddings.
# NOTE(review): later cells index the embedding matrix with the row labels of
# `sentences`, so the two files are presumably row-aligned — confirm.
SENTENCES_CSV = '../data/processed/temple_radio_1_2_sentences_with_translation.csv'
EMBEDDINGS_CSV = '../data/processed/temple_radio_1_2_sentence_embeddings.csv'

sentences = pd.read_csv(SENTENCES_CSV)
sent_embeddings = genfromtxt(EMBEDDINGS_CSV, delimiter=',')

In [3]:
# Positive class: every row whose translation is exactly this sentence.
is_target = sentences['Translation'] == 'The liver is enlarged.'
pos_sent = sentences.loc[is_target]

In [4]:
# The row labels of `sentences` are used directly as row positions in the
# embedding matrix, for the positive rows and for everything that remains.
pos_rows = pos_sent.index
neg_sent = sentences.drop(index=pos_rows)
neg_rows = neg_sent.index

pos_sent_embeddings = sent_embeddings[pos_rows, :]
neg_sent_embeddings = sent_embeddings[neg_rows, :]

In [5]:
# Same selection as at the end of the previous cell, recomputed here;
# the resulting array is unchanged.
neg_rows = neg_sent.index
neg_sent_embeddings = sent_embeddings[neg_rows, :]

In [6]:
# Renumber both frames from 0. After this, `pos_sent.index` / `neg_sent.index`
# no longer identify rows of `sentences` (or of the embedding matrix), so the
# embedding selection above must not be re-run after this cell.
pos_sent, neg_sent = (
    frame.reset_index(drop=True) for frame in (pos_sent, neg_sent)
)

In [7]:
# Stack positives first, then negatives, under a fresh 0..n-1 index.
new_sent_df = pd.concat([pos_sent, neg_sent], ignore_index=True)

In [8]:
# Label vectors: 1 for every positive embedding, 0 for every negative one.
y_pos = [1] * len(pos_sent_embeddings)
y_neg = [0] * len(neg_sent_embeddings)

# One frame per class: embedding dimensions as columns plus a 'class' label.
pos_df = pd.DataFrame(pos_sent_embeddings)
neg_df = pd.DataFrame(neg_sent_embeddings)
pos_df['class'] = y_pos
neg_df['class'] = y_neg

# Both frames were just built from arrays, so their index already runs from 0.
# Stack positives first, then negatives, renumbered 0..n-1.
new_df = pd.concat([pos_df, neg_df], ignore_index=True)

In [9]:
# Attach the original text to each embedding row. Assignment aligns on the row
# index; both frames were stacked positives-first and renumbered from 0.
text_columns = {'sentence': 'Sentence', 'translation': 'Translation'}
for target, source in text_columns.items():
    new_df[target] = new_sent_df[source]

In [10]:
# Shuffle the rows so the two classes are interleaved, then renumber from 0.
# A fixed seed keeps the row order — and therefore the fold membership and the
# per-fold result files written by the cross-validation loop — identical
# across kernel restarts.
new_df = new_df.sample(frac=1, random_state=0).reset_index(drop=True)

In [11]:
# Target as a single-column frame; everything else (the embedding dimensions
# plus the two text columns) stays in X.
y = new_df.loc[:, ["class"]]
X = new_df.drop(columns=["class"])

In [15]:
# 5-fold stratified cross-validation of a logistic-regression classifier.
# Per fold: fit on the training rows, predict the held-out rows, write a CSV
# with sentence / translation / true label / prediction, and record accuracy,
# F1 and the confusion matrix.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
acc_scores, f1_scores = [], []
conf_scores = []

# Loop-invariant pieces, built once instead of once (or twice) per fold:
# numeric features for the model, text columns for the report, flat labels.
features = X.drop(["sentence", "translation"], axis=1)
texts = X[["sentence", "translation"]]
labels = y.values.ravel()

# split() yields positional row indices, so rows are selected by position
# (.iloc / array indexing) rather than by index label.
for fold_idx, (train, test) in enumerate(skf.split(X, y)):
    clf = LogisticRegression(random_state=0, max_iter=1000).fit(features.iloc[train], labels[train])
    y_pred = clf.predict(features.iloc[test])
    y_true = labels[test]

    # Explicit copy so adding columns never writes into a view of X.
    df_skf = texts.iloc[test].copy()
    df_skf['y_true'] = y_true
    df_skf['pred'] = y_pred
    df_skf.to_csv(f"../data/processed/classification_results/liver_is_enlarged_result_{fold_idx}.csv", index=False)

    acc_scores.append(round(accuracy_score(y_true, y_pred), 4))
    f1_scores.append(round(f1_score(y_true, y_pred), 4))
    # labels=[0, 1] pins the matrix to 2x2 so the per-fold matrices can always be summed.
    conf_scores.append(confusion_matrix(y_true, y_pred, labels=[0, 1]))

# Element-wise sum of the per-fold matrices: counts over all held-out rows.
print(f"confusion matrix score:\n{sum(conf_scores)}")

confusion matrix score:
[[226   4]
 [  7  13]]


In [13]:
# Per-fold scores followed by their unweighted mean across folds.
mean_acc = sum(acc_scores) / len(acc_scores)
print(f"Acc scores: {acc_scores}\nMean acc: {mean_acc:.4f}\n")

mean_f1 = sum(f1_scores) / len(f1_scores)
print(f"F1 scores: {f1_scores}\nMean f1: {mean_f1:.4f}\n")

Acc scores: [0.94, 0.94, 0.98, 0.94, 0.98]
Mean acc: 0.9560

F1 scores: [0.5714, 0.5714, 0.8571, 0.6667, 0.8571]
Mean f1: 0.7047

