In [1]:
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

%matplotlib inline

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [3]:
# Load the sentence table (its 'Sentence' and 'Translation' columns are used by
# later cells) and the comma-separated sentence-embedding matrix.
sentences = pd.read_csv('../data/processed/temple_radio_1_2_sentences_with_translation.csv')
sent_embeddings = genfromtxt('../data/processed/temple_radio_1_2_sentence_embeddings.csv', delimiter=',')

# Later cells select rows of `sent_embeddings` using the row labels of
# `sentences`, so the two files must be row-aligned. Fail loudly here rather
# than silently pairing sentences with the wrong embeddings.
assert len(sentences) == len(sent_embeddings), (
    f"row mismatch: {len(sentences)} sentences vs {len(sent_embeddings)} embeddings"
)

In [4]:
# Sentences whose translation equals this label form the positive class.
# Surrounding whitespace is stripped before comparing so that rows differing
# from the label only by a leading/trailing space are not silently missed.
POSITIVE_TRANSLATION = 'The liver shows bumpy/nodular surface, which can be seen with cirrhosis.'
pos_sent = sentences[sentences['Translation'].str.strip() == POSITIVE_TRANSLATION]

In [5]:
# Split the embedding matrix the same way the sentence table is split:
# rows selected as positive vs. every remaining row.
pos_rows = pos_sent.index
pos_sent_embeddings = sent_embeddings[pos_rows, :]

neg_sent = sentences.drop(index=pos_rows)
neg_rows = neg_sent.index
neg_sent_embeddings = sent_embeddings[neg_rows, :]

In [6]:
# `neg_sent_embeddings` is already computed in the previous cell, so it is not
# recomputed here. Instead, check that the positive and negative splits
# together account for every sentence exactly once.
assert len(pos_sent_embeddings) + len(neg_sent_embeddings) == len(sentences), (
    "positive and negative embeddings must cover every sentence exactly once"
)

In [7]:
# Give both sentence subsets a fresh 0..n-1 index, discarding the old labels.
pos_sent, neg_sent = (
    subset.reset_index(drop=True) for subset in (pos_sent, neg_sent)
)

In [8]:
# Stack positives on top of negatives and renumber the rows 0..n-1.
new_sent_df = pd.concat([pos_sent, neg_sent], ignore_index=True)

In [9]:
# Class labels: 1 for every positive embedding, 0 for every negative one.
y_pos = [1] * len(pos_sent_embeddings)
y_neg = [0] * len(neg_sent_embeddings)

# One frame per class: the embedding columns plus a 'class' column.
pos_df = pd.DataFrame(pos_sent_embeddings)
neg_df = pd.DataFrame(neg_sent_embeddings)
pos_df['class'], neg_df['class'] = y_pos, y_neg

pos_df, neg_df = pos_df.reset_index(drop=True), neg_df.reset_index(drop=True)

# Positives first, then negatives, renumbered 0..n-1 (same order as new_sent_df).
new_df = pd.concat([pos_df, neg_df], ignore_index=True)

In [10]:
# Copy the text columns from new_sent_df into new_df under lower-case names;
# pandas aligns the values on the row index.
for target_col, source_col in (('sentence', 'Sentence'), ('translation', 'Translation')):
    new_df[target_col] = new_sent_df[source_col]

In [11]:
# Shuffle the rows. A fixed seed keeps the row order reproducible across runs;
# without it the cross-validation folds built from this frame, and therefore
# the reported metrics, change every time the notebook is executed.
new_df = new_df.sample(frac=1, random_state=0).reset_index(drop=True)

In [12]:
# Target: single-column frame holding 'class'; inputs: every other column.
y = new_df.loc[:, ["class"]]
X = new_df.drop(columns=["class"])

In [13]:
# 5-fold stratified cross-validation of a logistic-regression classifier.
# For each fold the test-set predictions are written to a CSV next to the
# sentence text, and accuracy, F1 and the confusion matrix are collected.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
acc_scores, f1_scores = [], []
conf_scores = []

# Loop-invariant pieces, computed once instead of on every fold:
# the model inputs (everything except the text columns), the text columns kept
# only for the per-fold report, and the labels as a 1-D Series.
features = X.drop(["sentence", "translation"], axis=1)
text_columns = X[["sentence", "translation"]]
labels = y["class"]

# skf.split yields positional row indices, hence .iloc rather than .loc.
for fold, (train, test) in enumerate(skf.split(features, labels)):
    clf = LogisticRegression(random_state=0, max_iter=1000).fit(
        features.iloc[train], labels.iloc[train].values
    )
    y_true = labels.iloc[test]
    y_pred = clf.predict(features.iloc[test])

    # Explicit copy so adding report columns never touches X.
    df_skf = text_columns.iloc[test].copy()
    df_skf['y_true'] = y_true
    df_skf['pred'] = y_pred
    df_skf.to_csv(f"../data/processed/classification_results/third_result_{fold}.csv", index=False)

    acc_scores.append(round(accuracy_score(y_true, y_pred), 4))
    f1_scores.append(round(f1_score(y_true, y_pred), 4))
    # Pin the label order so every fold yields a 2x2 matrix, even if a fold
    # happens to contain a single class; otherwise the sum below would break.
    conf_scores.append(confusion_matrix(y_true, y_pred, labels=[0, 1]))
print(f"confusion matrix score:\n{sum(conf_scores)}")

confusion matrix score:
[[236   0]
 [  2  12]]


In [14]:
# Report the per-fold scores and their mean across folds.
mean_acc = sum(acc_scores) / len(acc_scores)
print(f"Acc scores: {acc_scores}\nMean acc: {mean_acc:.4f}\n")

mean_f1 = sum(f1_scores) / len(f1_scores)
print(f"F1 scores: {f1_scores}\nMean f1: {mean_f1:.4f}\n")

Acc scores: [1.0, 1.0, 1.0, 0.98, 0.98]
Mean acc: 0.9920

F1 scores: [1.0, 1.0, 1.0, 0.8, 0.8]
Mean f1: 0.9200



In [17]:
# Reload the predictions file written for fold index 1 to inspect it.
fold_results_path = "../data/processed/classification_results/third_result_1.csv"
df = pd.read_csv(fold_results_path)

In [18]:
# Preview only the first rows instead of dumping the whole frame into the output.
df.head(10)

Unnamed: 0,sentence,translation,y_true,pred
0,"Few stable less than 5 mm hypodensities, too, ...",There are one or more lesions in the liver tha...,0,0
1,"Stable 1 cm laceration at the, posterior aspec...",There is an injury to the liver.,0,0
2,Hepatic veins patent.,empty,0,0
3,No discrete lesions identified within the liver.,empty,0,0
4,Normal in size and contour with scattered hepa...,"There are one or more lesions in the liver, so...",0,0
5,"Nodular hepatic surface contour reflecting, ch...","The liver shows bumpy/nodular surface, which c...",1,1
6,There is a hepatic lesion in the caudate lobe ...,There is a benign mass in the liver which does...,0,0
7,"No other focal, parenchymal abnormality.",empty,0,0
8,"Features of left hepatectomy noted., The right...",There are post surgical changes of the liver.,0,0
9,Normal size and contour.,empty,0,0
