In [116]:
import numpy as np
import cv2 as cv
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from utils import *

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

from gensim.test.utils import common_corpus, common_dictionary
from gensim.corpora import Dictionary
from gensim.models import HdpModel
from gensim import matutils

In [30]:
bp = "/home/hung/ws/data1/raf-db"
df = pd.read_csv("{}/data_aligned.csv".format(bp))
df_train = df.loc[df['name'].str.contains('train_')]
df_test = df.loc[df['name'].str.contains('test_')]
fname_train = ["{}/{}".format(bp, o) for o in df_train['name']]
fname_test = ["{}/{}".format(bp, o) for o in df_test['name']]
labels_train = [o for o in df_train['label']]
labels_test = [o for o in df_test['label']]

In [31]:
vocab = Vocab('suft.100.train.norml2.npy', 'sift.100.train.norml2.npy', 'kaze.100.train.norml2.npy')

In [62]:
def build_docs(fname_list, labels):
    docs, targets = [], []
    for fname, label in zip(fname_list, labels):
        try:
            out = list(vocab.query_id(fname))
            ids = [o for _,o in out]
            docs.append(ids)
            targets.append(label)
        except: pass # some error file
    return docs, np.array(targets)

def ff(fname_train, labels_train):
    train_docs, train_targets = build_docs(fname_train, labels_train)
    train_docs_str = [' '.join([str(a) for a in o]) for o in train_docs]

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(train_docs_str)  
    
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    return X_train_tf, train_targets

def pre_processing():
    # train
    X_train_tf, train_targets = ff(fname_train, labels_train)
    # test
    X_test_tf, test_targets = ff(fname_test, labels_test)
    return X_train_tf, train_targets, X_test_tf, test_targets

In [12]:
# vectorizer = TfidfVectorizer(min_df= 3, sublinear_tf=True, norm='l2', ngram_range=(1, 2))
# final_features = vectorizer.fit_transform(out1).toarray()
# final_features.shape

(12223, 31276)

In [63]:
X_train_tf, train_targets, X_test_tf, test_targets = pre_processing()

### NB

In [64]:
clf = MultinomialNB().fit(X_train_tf, train_targets)

In [66]:
predicted = clf.predict(X_test_tf)

In [67]:
np.mean(predicted == test_targets)  

0.4181639986932375

### SVM

In [130]:
clf1 = sklearn.svm.SVC(gamma='scale', coef0=4).fit(X_train_tf, train_targets)

In [131]:
predicted = clf1.predict(X_test_tf)

In [132]:
np.mean(predicted == test_targets)  

0.4900359359686377

## HDP

In [139]:
def get_vec_f(fname, labels, dct=None, hdp=None):
    docs, targets = build_docs(fname, labels)
    docs = [[str(o) for o in one] for one in docs]
    
    if dct is None: # train set
        dct = Dictionary(train_docs)
        for one in docs:
            dct.add_documents([[str(o) for o in one]])
            
    copus = [dct.doc2bow(o) for o in docs]
    if hdp is None: # train
        hdp = HdpModel(copus, dct)
        
    v = [hdp[o] for o in copus]
    v_d = matutils.corpus2dense(v, num_terms=len(dct.token2id)).T
    
    return copus, v_d, targets, dct, hdp

In [73]:
train_docs, train_targets = build_docs(fname_train, labels_train)

In [74]:
test_docs, test_targets = build_docs(fname_test, labels_test)

In [77]:
train_docs = [[str(o) for o in one] for one in train_docs]
test_docs = [[str(o) for o in one] for one in test_docs]

In [75]:
dct = Dictionary(train_docs)

In [76]:
for one in train_docs:
    dct.add_documents([[str(o) for o in one]])

In [78]:
copus = [dct.doc2bow(o) for o in train_docs]

In [80]:
hdp = HdpModel(copus, dct)

In [99]:
len(hdp.get_topics())

150

In [82]:
hdp[copus[1]]

[(8, 0.7347021583895617), (46, 0.1441341423208861), (79, 0.11680245565832961)]

In [83]:
vectors = [hdp[o] for o in copus]

In [91]:
len(vectors), len(train_targets)

(12223, 12223)

In [95]:
vectors[:3]

[[(3, 0.46183658830517704),
  (26, 0.10384062750052449),
  (89, 0.06240996670190984),
  (128, 0.1699017207216343),
  (136, 0.0854548720815281),
  (138, 0.08636650952394877),
  (148, 0.02410010896361052)],
 [(8, 0.7347021583895617),
  (46, 0.1441341423208861),
  (79, 0.11680245565832961)],
 [(1, 0.5838619553442171),
  (46, 0.05318602792629546),
  (99, 0.2291244361167196),
  (101, 0.03952411922636785),
  (111, 0.08837481488811819)]]

In [92]:
tf_sparse_array = matutils.corpus2csc(vectors)

In [104]:
a = matutils.corpus2dense(vectors, num_terms=len(dct.token2id))
a.T.shape

(12223, 300)

In [107]:
# testing
test_data = [dct.doc2bow(o) for o in test_docs]

In [108]:
hdp[test_data[0]]

[(0, 0.6753637283283193), (10, 0.3200522183522646)]

In [109]:
test_vectors = [hdp[o] for o in test_data]

In [110]:
testdense = matutils.corpus2dense(test_vectors, num_terms=len(dct.token2id)).T

In [140]:
train_copus, train_v, train_targets, dct, hdp = get_vec_f(fname_train, labels_train)

In [141]:
test_copus, test_v, test_targets, dct, hdp = get_vec_f(fname_test, labels_test, dct, hdp)

In [None]:
clf2 = MultinomialNB().fit(train_v, train_targets)
predicted = clf2.predict(test_v)
np.mean(predicted == test_targets) 

In [143]:
clf2 = sklearn.svm.SVC(gamma='scale', coef0=4).fit(train_v, train_targets)
predicted = clf2.predict(test_v)
np.mean(predicted == test_targets) 

0.3982358706305129

### result for NB

In [106]:
clf = MultinomialNB().fit(a.T, train_targets)

In [111]:
t_predict = clf.predict(testdense)

In [112]:
t_predict

array([4, 4, 4, ..., 4, 4, 4])

In [113]:
np.mean(t_predict == test_targets)  

0.4024828487422411

### result for SVM

In [133]:
clf1 = sklearn.svm.SVC(gamma='scale', coef0=4).fit(a.T, train_targets)

In [134]:
t_predict = clf1.predict(testdense)

In [135]:
np.mean(t_predict == test_targets)

0.4122835674616139