In [1]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# (e.g. the local `datasets` / `classify` modules imported further down) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import datetime
import json
from pprint import pprint
from IPython.core import display as ICD  # to print multiple nice pandas tables
from collections import defaultdict
import logging
import numpy as np
import io
import gensim
import tqdm

In [19]:
import datasets
import classify

In [20]:
from sklearn.linear_model import LogisticRegression
from gensim.models import TfidfModel
class SimpleModel(object):
    """Text classifier: bag-of-words -> optional TF-IDF -> optional term weights -> LSI -> classifier.

    Parameters
    ----------
    cls : object, optional
        Estimator exposing ``fit(features, Y)`` and ``predict(features)``.
        Defaults to a fresh ``LogisticRegression()``.
    use_tfidf : bool
        When true, a gensim ``TfidfModel`` is fitted on the bag-of-words corpus
        and applied before the LSI projection.
    w : array-like or None
        Optional per-term multiplicative weights, indexed by dictionary term id.
        ``None`` means "no re-weighting". The constructor argument is never
        overwritten by ``fit``; the weights actually used by the last fit are
        stored in ``term_weights_``.
    """

    def __init__(self, cls = None, use_tfidf=False, w=None):
        self.cls = cls
        if self.cls is None:
            self.cls = LogisticRegression()
        self.use_tfidf = use_tfidf
        # Keep the caller's weights (they were previously discarded).
        self.w = w
        self.term_weights_ = None

    def _resolve_weights(self):
        """Validate ``self.w`` against the fitted vocabulary; return an array or None."""
        if self.w is None:
            return None
        weights = np.asarray(self.w, dtype=float).ravel()
        if weights.shape[0] != self.num_terms:
            raise ValueError(
                "w has %d entries but the fitted dictionary has %d terms"
                % (weights.shape[0], self.num_terms)
            )
        return weights

    def _transform_bow(self, bow):
        """Apply the fitted TF-IDF model (if enabled) and the term weights (if any)."""
        if self.use_tfidf:
            bow = self.tfidf_model[bow]
        weights = self.term_weights_
        if weights is None:
            # No re-weighting requested: hand the corpus through untouched.
            return bow
        return [
            [(term_id, value * weights[term_id]) for term_id, value in doc]
            for doc in bow
        ]

    def _project(self, bow):
        """Project a corpus through the fitted LSI model into a dense (n_docs, n_topics) array."""
        return gensim.matutils.corpus2dense(self.lsi[bow], self.lsi.num_topics).T

    def fit(self, X, Y):
        """Build the dictionary, TF-IDF (optional) and LSI models from X, then fit the classifier.

        X is passed to ``gensim.corpora.Dictionary`` and ``doc2bow`` as-is, so it
        must be a collection of tokenized documents in the form those accept.
        Y holds one label per document and is forwarded to ``self.cls.fit``.

        Raises ValueError when ``w`` was supplied with a length that does not
        match the number of dictionary terms.
        """
        self.dictionary = gensim.corpora.Dictionary(X)
        self.num_terms = len(self.dictionary.dfs)
        raw_bow = [self.dictionary.doc2bow(doc) for doc in X]
        if self.use_tfidf:
            # The TF-IDF statistics come from the raw counts of the training corpus.
            self.tfidf_model = TfidfModel(raw_bow)
        self.term_weights_ = self._resolve_weights()
        # The same transformed corpus is used to train LSI and to build features,
        # so the term weights actually influence the model.
        bow = self._transform_bow(raw_bow)
        self.lsi = gensim.models.LsiModel(bow, id2word=self.dictionary)
        self.cls.fit(self._project(bow), Y)

    def backprob_w(self, X, Y):
        """Placeholder; not implemented."""
        pass

    def predict(self, X):
        """Return the classifier's predictions for the tokenized documents in X.

        Applies the same bag-of-words / TF-IDF / term-weight / LSI pipeline that
        was fitted in ``fit``.
        """
        raw_bow = [self.dictionary.doc2bow(doc) for doc in X]
        Yhat = self.cls.predict(self._project(self._transform_bow(raw_bow)))
        return Yhat

    def score(self, X, Y):
        """Return one minus the mean squared difference between predictions and Y.

        For 0/1 labels and 0/1 predictions this equals the fraction of correct
        predictions.
        """
        Yhat = self.predict(X)
        return 1-((Yhat-np.asarray(Y))**2).mean()

    def update(self, X, Y):
        """Placeholder; not implemented."""
        pass

In [6]:
# Fit SimpleModel with the hand-written Classifier on datasets.CRDataset and
# score it on the same samples it was fitted on (i.e. a training-set score).
# NOTE(review): `Classifier` is defined in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError unless that cell has been run first.
dataset = datasets.CRDataset()
model = SimpleModel(Classifier())
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.6376158940397352

In [7]:
def test_model(mode, dataset, tries=10):
    results = []
    for _ in tqdm.trange(tries):
        model.fit(dataset.samples, dataset.labels)
        results.append(model.score(dataset.samples, dataset.labels))
    print(np.min(results), np.mean(results), np.max(results), np.std(results))

In [8]:
# Repeated fit/score runs on datasets.CRDataset with the TF-IDF step enabled
# and the default LogisticRegression classifier.
dataset = datasets.CRDataset()
model = SimpleModel(use_tfidf=True)
test_model(model, dataset)

100%|██████████| 10/10 [00:21<00:00,  2.11s/it]

0.7875496688741722 0.7944370860927152 0.799205298013245 0.0038842061718185775





In [9]:
# Same repeated evaluation on datasets.CRDataset, this time with the TF-IDF
# step disabled (raw bag-of-words counts go into LSI).
dataset = datasets.CRDataset()
model = SimpleModel(use_tfidf=False)
test_model(model, dataset)

100%|██████████| 10/10 [00:19<00:00,  1.90s/it]

0.782251655629139 0.7866225165562913 0.7896688741721855 0.002686489775635024





In [10]:
# Single fit of the default SimpleModel on datasets.CRDataset; the score is
# computed on the same samples used for fitting.
dataset = datasets.CRDataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7819867549668874

In [11]:
# Single fit of the default SimpleModel on datasets.MPQADataset; the score is
# computed on the same samples used for fitting.
dataset = datasets.MPQADataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.7543843107674901

In [16]:
# Single fit of the default SimpleModel on datasets.SUBJDataset; the score is
# computed on the same samples used for fitting.
dataset = datasets.SUBJDataset()
model = SimpleModel()
model.fit(dataset.samples, dataset.labels)
model.score(dataset.samples, dataset.labels)

0.8861

In [28]:
from scipy.stats import logistic
import numpy as np

class Classifier(object):
    """Single logistic unit trained by full-batch gradient descent on squared error.

    The model is ``Yhat = logistic(X @ w + b)``; each update moves ``w`` and
    ``b`` against the gradient of ``0.5 * sum((Yhat - Y) ** 2)``.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every gradient step.
    """

    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def fit(self, X, Y, epoch=1000):
        """Randomly initialise ``w`` and ``b`` and run ``epoch`` full-batch updates.

        X is a 2-D array of shape (n_samples, n_features); Y holds one target
        per sample.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        # Small symmetric initialisation in [-0.05, 0.05).
        self.w = np.random.random(X.shape[1])*0.1 - 0.05
        self.b = np.random.random()*0.1 - 0.05
        for _ in range(epoch):
            self.update(X, Y)

    def update(self, X, Y):
        """Take one gradient-descent step of size ``alpha`` on the batch (X, Y)."""
        dw, db = self.gradient(X, Y)
        # Scale the step by the learning rate; previously `alpha` was stored but
        # never used, so the raw summed gradient was subtracted.
        self.w -= self.alpha * dw
        self.b -= self.alpha * db

    def gradient(self, X, Y):
        """Return ``(dw, db)``: the squared-error gradient summed over the batch."""
        X = np.asarray(X)
        Y = np.asarray(Y)
        Yhat = logistic.cdf(X.dot(self.w) + self.b)
        # d(loss)/d(pre-activation) = (Yhat - Y) * logistic'(h), where
        # logistic'(h) = Yhat * (1 - Yhat).
        sigma = (Yhat - Y) * Yhat * (1 - Yhat)
        dw = sigma.dot(X)
        db = sigma.sum()
        return dw, db

    def predict(self, X):
        """Return 0/1 predictions: 1 where the logistic output exceeds 0.5."""
        h = np.asarray(X).dot(self.w) + self.b
        Yhat = logistic.cdf(h)
        return (Yhat>0.5)+0

In [36]:
# Repeated fit/score runs of the default SimpleModel (LogisticRegression,
# no TF-IDF) on datasets.CRDataset.
dataset = datasets.CRDataset()
model = SimpleModel()
test_model(model, dataset)

100%|██████████| 10/10 [00:19<00:00,  1.96s/it]

0.782251655629139 0.7879205298013245 0.7909933774834437 0.0028164310166891725





In [37]:
# Repeated fit/score runs on datasets.CRDataset with the hand-written
# Classifier substituted for the default LogisticRegression.
dataset = datasets.CRDataset()
model = SimpleModel(Classifier())
test_model(model, dataset)
#model.fit(dataset.samples, dataset.labels)
#model.score(dataset.samples, dataset.labels)

100%|██████████| 10/10 [00:49<00:00,  4.92s/it]

0.6376158940397352 0.6376158940397352 0.6376158940397352 0.0





In [11]:
# Synthetic sanity check for Classifier: n points with d features drawn
# uniformly from [-0.5, 0.5); the label is 1 when the row sum plus a noise
# term is positive, else 0.
# NOTE(review): `np.random.random(n)* - 0.5` parses as multiplication by -0.5,
# so the noise term lies in (-0.25, 0] rather than being centred on zero --
# confirm whether `- 0.5` (subtraction) was intended.
# NOTE(review): no random seed is set, so the data and the fit differ per run.
n, d = 100, 5
X = np.random.random((n, d))-0.5
Y = ((X.sum(axis=1) + np.random.random(n)* - 0.5)>0)+0

cls = Classifier()
cls.fit(X, Y)

In [34]:
# Training accuracy on the synthetic data: predictions and labels are both 0/1,
# so the squared difference is 1 exactly where they disagree.
1-((cls.predict(X)-Y)**2).mean()

0.95