In [99]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from nltk.stem.snowball import SnowballStemmer
from imblearn.under_sampling import NearMiss


In [100]:
def give_me_a_frame(data_location):
    """Load a text file into a one-column DataFrame, one row per line.

    The lines are read verbatim instead of going through ``pd.read_csv`` with
    ``sep='\\n'``: recent pandas versions reject a newline separator, and CSV
    quote handling can alter or merge free-text lines that contain ``"``.

    Parameters
    ----------
    data_location : str, path-like or text file object
        Path of the file to read (opened as UTF-8), or an already opened
        text stream that yields lines when iterated.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column labelled ``0`` holding each non-empty
        line without its line terminator. Completely empty lines are skipped.
    """
    def _non_empty_lines(handle):
        # Strip only the line terminator; tabs and spaces are part of the data.
        stripped = (line.rstrip('\r\n') for line in handle)
        return [line for line in stripped if line]

    if hasattr(data_location, 'read'):
        lines = _non_empty_lines(data_location)
    else:
        with open(data_location, encoding='utf-8') as handle:
            lines = _non_empty_lines(handle)

    return pd.DataFrame({0: lines})

In [101]:
def preprocess(statements):
    """Normalise raw documents into space-joined, stemmed tokens.

    Each statement is lower-cased and split on whitespace. Every token is then
    stripped of ``string.punctuation`` characters, dropped if it is an NLTK
    English stop word or became empty, and stemmed with the English Snowball
    stemmer. Digits are left untouched.

    Stop words are checked both before and after punctuation stripping, so
    contractions listed with their apostrophe and words glued to punctuation
    (for example ``"the,"``) are both removed.

    Parameters
    ----------
    statements : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input statement, in the same order.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned = []
    for statement in statements:
        tokens = []
        for word in statement.lower().split():
            if word in stop_words:
                continue
            word = word.translate(punctuation_table)
            # Punctuation-only tokens collapse to "" and would otherwise leave
            # stray spaces in the joined output.
            if not word or word in stop_words:
                continue
            tokens.append(stemmer.stem(word))
        cleaned.append(" ".join(tokens))
    return cleaned

In [102]:
def build_matrix(train, test, n):
    """Turn two text corpora into TF-IDF matrices sharing one vocabulary.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.
    Features are word n-grams of length 1 through ``n`` and rows are
    L2-normalised.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as produced by ``fit_transform`` and
        ``transform`` respectively.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, n), norm='l2')
    train_matrix = tfidf.fit_transform(train)
    test_matrix = tfidf.transform(test)
    return train_matrix, test_matrix

In [103]:
def predict_output(train_matrix, test_vector, train_classes,  k=3 ):
    """Predict a label for one sample by similarity-weighted k-nearest-neighbour vote.

    Similarity is the dot product between ``test_vector`` and every row of
    ``train_matrix`` (this equals cosine similarity when the rows are
    L2-normalised, as ``build_matrix`` produces them). The ``k`` most similar
    training rows each add their similarity to the tally of their class, and
    the class with the largest total wins. Ties go to the class that was
    encountered first.

    Parameters
    ----------
    train_matrix : scipy sparse matrix
        One row per training sample.
    test_vector : scipy sparse matrix
        A single-row matrix for the sample to classify.
    train_classes : sequence
        Label of each training row, indexable by row position.
    k : int, optional
        Number of neighbours that take part in the vote.

    Returns
    -------
    The winning label. When no training row has a non-zero similarity with
    the sample (or ``k`` selects no neighbour), the most frequent label in
    ``train_classes`` is returned instead of raising ``IndexError``.
    """
    dot_product = test_vector.dot(train_matrix.T)
    # Only structurally non-zero similarities are stored in the sparse result.
    sims = list(zip(dot_product.indices, dot_product.data))
    sims.sort(key=lambda x: x[1], reverse=True)

    tc = defaultdict(float)
    for row_index, similarity in sims[:k]:
        tc[train_classes[row_index]] += similarity

    if not tc:
        # Nothing to vote with: fall back to the overall majority class.
        return Counter(train_classes).most_common(1)[0][0]

    # max() keeps the first of equally weighted classes, matching a stable
    # descending sort followed by taking the head.
    return max(tc.items(), key=lambda item: item[1])[0]

In [104]:
def splitData(mat, cls, fold=1, d=10):
    r""" Cut the matrix and its labels into d consecutive chunks and stack them back.

    The chunk size is ceil(rows / d) and is printed. No chunk is held out:
    ``fold`` is accepted but not used, so the returned data covers every row
    of ``mat`` in its original order.

    Returns the re-stacked sparse matrix and the labels as a flat list.
    """
    n_rows = mat.shape[0]
    chunk = int(np.ceil(n_rows * 1.0 / d))
    print(chunk)

    # Row boundaries of each of the d chunks; the last ones are clipped to n_rows.
    bounds = [(i * chunk, min((i + 1) * chunk, n_rows)) for i in range(d)]

    row_chunks = [mat[lo:hi] for lo, hi in bounds]
    labels = []
    for lo, hi in bounds:
        labels.extend(cls[lo:hi])

    return sp.vstack(row_chunks), labels

In [105]:
def classify(statements, statements_test, classes, c, k):
    """Score the k-NN classifier on a hold-out slice of the training data.

    Both corpora are run through ``preprocess`` and vectorised with
    ``build_matrix`` using n-grams up to length ``c``. The training matrix is
    then split 67/33 with a fixed seed, every hold-out row is classified with
    ``predict_output`` using ``k`` neighbours, and macro-averaged
    precision/recall/F-score are printed.

    Note that the vectorizer is fitted on the full training corpus before the
    split, so hold-out rows contribute to the vocabulary and IDF weights.

    Parameters
    ----------
    statements : iterable of str
        Raw training documents.
    statements_test : iterable of str
        Raw test documents; only needed to satisfy ``build_matrix``.
    classes : sequence
        Label of each training document.
    c : int
        Largest n-gram length for the TF-IDF features.
    k : int
        Number of neighbours used by ``predict_output``.

    Returns
    -------
    tuple
        The value of ``precision_recall_fscore_support(..., average='macro')``
        that is also printed, so callers can compare parameter settings.
    """
    print("c = ", c)
    print("k = ", k)
    train_docs = preprocess(statements)
    test_docs = preprocess(statements_test)
    labels = np.array(classes)

    # The TF-IDF test matrix is not used for scoring; keep it under a throwaway
    # name so it is not confused with the hold-out split below.
    tfidf_train, _ = build_matrix(train_docs, test_docs, c)

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        tfidf_train, labels, test_size=0.33, random_state=21)
    y_pred = [predict_output(X_train, row, y_train, k) for row in X_holdout]

    scores = precision_recall_fscore_support(y_holdout, y_pred, average='macro')
    print(scores)
    return scores
    

In [106]:
# Training file: every line is split at its first tab; column 0 is used as the
# class label and column 1 as the document text.
# `n` is passed by keyword because newer pandas no longer accepts it positionally.
train_df = give_me_a_frame('data/train.dat')[0].str.split('\t', n=1, expand=True)
statements = train_df[1]
classes = train_df[0]
print(statements[0])

# n, bins, patches = plt.hist(classes)
# plt.show()

# Test file: one unlabeled document per line.
test_df = give_me_a_frame('data/test.dat')
statements_test = test_df[0]

# Kept so that any cell still reading `df` sees the same frame as before.
df = test_df
# print(statements_test)
# df = pd.DataFrame({'aaj':statements})

# df[df['aaj'].str.contains('acute')]


Catheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). Cardiogenic shock was present in eight patients with infarction of the left anterior descend

In [107]:
# Run both corpora through the shared text pipeline and turn the labels into
# a NumPy array. The names are rebound in place, so re-running this cell feeds
# already-processed text back through preprocess().
statements, statements_test = [
    preprocess(corpus) for corpus in (statements, statements_test)
]
classes = np.array(classes)

# Show the first processed training document as a sanity check.
print(statements[:1])
# df[df['aaj'].str.contains('acut')]


# X_Train, X_test = build_matrix(statements,statements_test,2)

['catheter laboratori event hospit outcom direct angioplasti acut myocardi infarct assess safeti direct infarct angioplasti without anteced thrombolyt therapi catheter laboratori hospit event assess consecut treat patient infarct involv left anterior descend n patient right n circumflex n coronari arteri group patient similar age left anterior descend coronari arteri year right coronari arteri year circumflex coronari arteri year patient multivessel diseas left anterior descend coronari arteri right coronari arteri circumflex coronari arteri patient initi grade antegrad flow left anterior descend coronari arteri right coronari arteri circumflex coronari arteri cardiogen shock present eight patient infarct left anterior descend coronari arteri four infarct right coronari arteri four infarct circumflex coronari arteri major catheter laboratori event cardiovers cardiopulmonari resuscit dopamin intraaort balloon pump support hypotens urgent surgeri occur patient infarct left anterior desce

In [33]:
# nm = NearMiss(version=2,random_state=42)
# X_res, y_res = nm.fit_sample(X_Train, classes)
# labels, values = zip(*Counter(y_res).most_common())
# indexes = np.arange(len(labels))
# width = 1
# plt.bar(indexes, values, width)
# plt.xticks(indexes + width * 0.5, labels)
# plt.show()

In [108]:
# Largest n-gram length for the TF-IDF features and neighbourhood size for the
# final predictions.
# NOTE(review): 2 mirrors the commented-out build_matrix call and grid-search
# loop in this notebook -- confirm it is the intended setting.
NGRAM_MAX = 2
K_NEIGHBOURS = 69

# Build the feature matrices in this cell so it runs on a fresh kernel; the
# previous version read X_res, y_res and X_test, which no active cell defines.
# NOTE(review): the NearMiss under-sampling cell is commented out, so the full
# training matrix is used here; swap in its output if under-sampling is wanted.
X_Train, X_test = build_matrix(statements, statements_test, NGRAM_MAX)

# The two halves of the split are stacked back together right away, so every
# training row is used; the net effect is a seeded reordering of the rows.
X_train, x_test, y_train, y_test = train_test_split(X_Train, classes, test_size=0.33, random_state=21)
X_train = sp.vstack([X_train, x_test])
y_train = np.append(y_train, y_test)

# One predicted label per test document, in file order.
with open('out.dat', 'w') as f:
    for i in range(X_test.shape[0]):
        f.write("%s\n" % (predict_output(X_train, X_test[i], y_train, K_NEIGHBOURS)))
        
# clspr = [predict_output(X_Train, x_test, classes, 5) for x_test in X_test]
# print(clspr)
# aaj = cosine_similarity(X_test[:1],X_Train).flatten()
# kal = X_test[:1].dot(X_Train.T)
# similar_docs = aaj.argsort()[:-6:-1]
# statements = np.array(statements)
# classes = np.array(classes)
# print(similar_docs)
# print(classes[similar_docs])
# print(kal)

In [97]:
# for c in range(2,3):
#     for k in range(68,73):
#         classify(statements, statements_test, classes, c, k)

c =  4
k =  68
(0.5300770841030762, 0.5193884810597736, 0.5201577009578067, None)
c =  4
k =  69


KeyboardInterrupt: 