In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import datetime
import json
from pprint import pprint
from IPython.core import display as ICD  # to print multiple nice pandas tables
from collections import defaultdict
import logging
import numpy as np
import io
import gensim
import tqdm

In [3]:
import datasets
import classify

In [4]:
def test_model(mode, dataset, tries=10):
    results = []
    for _ in tqdm.trange(tries):
        model.fit(dataset.samples, dataset.labels)
        results.append(model.score(dataset.samples, dataset.labels))
    print(np.min(results), np.mean(results), np.max(results), np.std(results))

In [61]:
from sklearn.linear_model import LogisticRegression
from gensim.models import TfidfModel
class SimpleModel(object):
    def __init__(self, cls = None, use_tfidf=False, w=None):
        self.cls = cls
        if self.cls is None:
            self.cls = LogisticRegression()
        self.use_tfidf = use_tfidf
        self.w=None
        

    def fit(self, X, Y):
        self.dictionary = gensim.corpora.Dictionary(X)
        self.num_terms = len(self.dictionary.dfs)
        bow = list(map(self.dictionary.doc2bow, X))
        if self.use_tfidf:
            self.tfidf_model = TfidfModel(bow)
            bow = self.tfidf_model[bow]
        if self.w is None:
            self.w = np.ones(self.num_terms)
        if True:
            bow = gensim.matutils.corpus2csc(bow)
            bow = gensim.matutils.Sparse2Corpus(bow.multiply(self.w.reshape(-1,1)))
        self.lsi = gensim.models.LsiModel(bow, id2word=self.dictionary)
        self.corpus = gensim.matutils.corpus2dense(self.lsi[bow], self.lsi.num_topics).T
        self.cls.fit(self.corpus, Y)
    
    def dw(self, X, Y):
        _ = self.predict(X)
        self.d_embedding = self.cls.dx(self.embedding, Y)
        print(self.d_embedding.shape)
        u = self.lsi.projection.u
        res = self.d_embedding.dot(u.T)
        dw = model.bow.multiply(res.T).sum(axis=1).T[0]
        return dw
        
    
    def predict(self, X):
        bow = list(map(self.dictionary.doc2bow, X))
        if self.use_tfidf:
            bow = self.tfidf_model[bow]
        if True:
            bow = gensim.matutils.corpus2csc(bow)
            self.bow = bow
            bow = bow.multiply(self.w.reshape(-1,1))
            bow = gensim.matutils.Sparse2Corpus(bow)

        self.embedding = gensim.matutils.corpus2dense(self.lsi[bow], self.lsi.num_topics).T
        Yhat = self.cls.predict(self.embedding)
        return Yhat
    
    def score(self, X, Y):
        Yhat = self.predict(X)
        return 1-((Yhat-Y)**2).mean()
    
    def update(self, X, Y):
        pass

In [62]:
dataset = datasets.CRDataset()
model = SimpleModel(classify.SkClassifier())
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7849006622516557

In [63]:
dw = model.dw(dataset.samples, dataset.labels)

(3775, 200)


In [26]:
pom = model.bow.T.dot(model.lsi.projection.u)

In [29]:
u = model.lsi.projection.u

In [38]:
x = u.dot(u.T)
x.diagonal()

array([9.83579755e-01, 9.94838814e-01, 9.87574779e-01, ...,
       2.04783358e-04, 2.04783358e-04, 5.16126162e-04])

In [50]:
res = model.d_embedding.dot(u.T)
model.bow.multiply(res.T).sum(axis=1).T[0]

(3775, 5674)

In [57]:
model.bow.multiply(res.T).sum(axis=1).T[0]

matrix([[1.46199778e+00, 6.06734417e-01, 1.09502181e-01, ...,
         9.79000765e-04, 9.79000765e-04, 1.44550322e-03]])

In [28]:
pom.shape

(3775, 200)

In [13]:
sbow = model.bow[:10]
su = model.lsi.projection.u[:10]

In [24]:
print(model.bow.shape)
print(model.lsi.projection.u.shape)

(5674, 3775)
(5674, 200)


In [23]:
print(sbow.shape)
print(su.shape)

(10, 3775)
(10, 200)


In [25]:
sbow.T.dot(su).shape

(3775, 200)

In [8]:
model.lsi.projection.u.shape

(5674, 200)

In [9]:
model.corpus.shape

(3775, 200)

In [132]:
model.cls.dx(X, Y).shape

(3775, 200)

In [86]:
sample = [[0,1]]
model.lsi[sample]

[(0, 0.03467151321641605),
 (1, -0.008379483881744086),
 (2, -0.08778179090532488),
 (3, 0.0513274107869992),
 (4, 0.05593746875602374),
 (5, -0.007593093451914422),
 (6, 0.02499806230982492),
 (7, -0.008707268655324677),
 (8, -0.02296305911713987),
 (9, -0.040910024068387736),
 (10, -0.0007906049902375272),
 (11, -0.012902751337078093),
 (12, 0.07049760125802405),
 (13, 0.014653466054823332),
 (14, -0.037025247427074),
 (15, 0.09513512894365837),
 (16, 0.07250045490848336),
 (17, -0.19494116255486238),
 (18, 0.03976788514333459),
 (19, 0.40517573033666504),
 (20, -0.1101155226455703),
 (21, 0.34968312244303823),
 (22, -0.03631821014049906),
 (23, -0.6925449416478271),
 (24, 0.17989532789534615),
 (25, 0.12016773007369835),
 (26, 0.01172160822846742),
 (27, -0.026121067882449302),
 (28, 0.05123476298594439),
 (29, -0.05761857703257091),
 (30, -0.0006217311955445691),
 (31, 0.05033303085579515),
 (32, -0.09224715248457997),
 (33, 0.040129419519863034),
 (34, 0.04651275770944592),
 (35, 

In [87]:
bo = np.zeros(model.num_terms)
bo[0]=1.

In [133]:
p.u.shape

(5674, 200)

In [103]:
p = model.lsi.projection
np.dot(bo, p.u)

array([ 3.46715132e-02, -8.37948388e-03, -8.77817909e-02,  5.13274108e-02,
        5.59374688e-02, -7.59309345e-03,  2.49980623e-02, -8.70726866e-03,
       -2.29630591e-02, -4.09100241e-02, -7.90604990e-04, -1.29027513e-02,
        7.04976013e-02,  1.46534661e-02, -3.70252474e-02,  9.51351289e-02,
        7.25004549e-02, -1.94941163e-01,  3.97678851e-02,  4.05175730e-01,
       -1.10115523e-01,  3.49683122e-01, -3.63182101e-02, -6.92544942e-01,
        1.79895328e-01,  1.20167730e-01,  1.17216082e-02, -2.61210679e-02,
        5.12347630e-02, -5.76185770e-02, -6.21731196e-04,  5.03330309e-02,
       -9.22471525e-02,  4.01294195e-02,  4.65127577e-02, -8.64053745e-02,
       -1.50906643e-02, -1.85317483e-02,  3.81095485e-02, -2.72229378e-02,
        8.60475096e-02, -4.48799706e-03, -3.81005680e-02,  3.79228428e-02,
        7.22645900e-02, -6.26114957e-02, -2.94155045e-02,  8.31434206e-02,
        6.64245487e-03, -1.40075841e-02,  1.05852419e-02, -9.07165101e-03,
       -1.16620398e-02, -

In [68]:
dataset = datasets.CRDataset()
model = SimpleModel(classify.SkClassifier())
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7904635761589404

In [60]:
dataset = datasets.CRDataset()
model = SimpleModel(classify.SkClassifier(), use_tfidf=True)
test_model(model, dataset)

100%|██████████| 10/10 [00:23<00:00,  2.33s/it]

0.7907284768211921 0.7958675496688743 0.7989403973509934 0.0020730063625445105





In [8]:
dataset = datasets.CRDataset()
model = SimpleModel(use_tfidf=True)
test_model(model, dataset)

100%|██████████| 10/10 [00:21<00:00,  2.11s/it]

0.7875496688741722 0.7944370860927152 0.799205298013245 0.0038842061718185775





In [9]:
dataset = datasets.CRDataset()
model = SimpleModel(use_tfidf=False)
test_model(model, dataset)

100%|██████████| 10/10 [00:19<00:00,  1.90s/it]

0.782251655629139 0.7866225165562913 0.7896688741721855 0.002686489775635024





In [10]:
dataset = datasets.CRDataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7819867549668874

In [11]:
dataset = datasets.MPQADataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7543843107674901

In [16]:
dataset = datasets.SUBJDataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.8861

In [28]:
from scipy.stats import logistic
import numpy as np

class Classifier(object):
    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def fit(self, X, Y, epoch=1000):
        self.w = np.random.random(X.shape[1])*0.1 - 0.05
        self.b = np.random.random()*0.1 - 0.05
        for e in range(epoch):
            self.update(X, Y)
    
    def update(self, X, Y):
        h = (X*self.w).sum(axis=1) + self.b
        Yhat = logistic.cdf(h)
        dif = Yhat-Y
        sigma = dif * Yhat * (1-Yhat)
        dw = (sigma.reshape(-1,1)* X).sum(axis=0)
        db = sigma.sum()
        self.b -= db
        self.w -= dw
        
    def gradient(X, Y):
        pass
    
    def predict(self, X):
        h = (X*self.w).sum(axis=1) + self.b
        Yhat = logistic.cdf(h)
        return (Yhat>0.5)+0

In [36]:
dataset = datasets.CRDataset()
model = SimpleModel()
test_model(model, dataset)

100%|██████████| 10/10 [00:19<00:00,  1.96s/it]

0.782251655629139 0.7879205298013245 0.7909933774834437 0.0028164310166891725





In [37]:
dataset = datasets.CRDataset()
model = SimpleModel(Classifier())
test_model(model, dataset)
#model.fit(dataset.samples, dataset.labels)
#model.score(dataset.samples, dataset.labels)

100%|██████████| 10/10 [00:49<00:00,  4.92s/it]

0.6376158940397352 0.6376158940397352 0.6376158940397352 0.0





In [11]:
n, d = 100, 5
X = np.random.random((n, d))-0.5
Y = ((X.sum(axis=1) + np.random.random(n)* - 0.5)>0)+0

cls = Classifier()
cls.fit(X, Y)

In [34]:
1-((cls.predict(X)-Y)**2).mean()

0.95