# Allen Institute
## 01_preprocess
---
## Before you start
- Run the `make init` command in your local environment.

In [3]:
import glob
import json
import os
from typing import List, NamedTuple

import numpy as np
import pandas as pd
import scipy.sparse as sp


# from tools.preprocess import fmt_rpm, fmt_table, find

In [394]:
import pickle

import numpy as np
import pandas as pd
import scipy.sparse as sp


class SparseDF:
    """Labelled dense array that can be exported as a SciPy sparse matrix.

    The wrapper keeps the values of a ``pandas.DataFrame`` together with its
    row and column labels.  Element-wise operators return a new ``SparseDF``
    that carries this instance's labels, and calling the instance builds a
    ``scipy.sparse.csc_matrix`` from the values.

    Attributes:
        values: ``df.values`` of the wrapped frame.
        index: Row labels of the wrapped frame.
        columns: Column labels of the wrapped frame.
        shape: Shape of ``values``.
    """

    # ``__eq__`` is overridden to be element-wise, which already makes
    # instances unhashable; state that explicitly instead of implicitly.
    __hash__ = None

    def __init__(self, df: pd.DataFrame):
        """Capture the values and labels of ``df``."""
        self.values = df.values
        self.index = df.index
        self.columns = df.columns
        self.shape = self.values.shape

    def __repr__(self) -> str:
        return f"{type(self).__name__}(shape={self.shape})"

    # ------------------------------------------------------------------ #
    # internal helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _unwrap(other):
        """Return the underlying array when ``other`` is a ``SparseDF``."""
        return other.values if isinstance(other, SparseDF) else other

    def _wrap(self, values) -> "SparseDF":
        """Wrap ``values`` in a new ``SparseDF`` with this instance's labels."""
        return SparseDF(
            pd.DataFrame(values, index=self.index, columns=self.columns)
        )

    # ------------------------------------------------------------------ #
    # container protocol
    # ------------------------------------------------------------------ #
    def __call__(self):
        """Return the values as a ``scipy.sparse.csc_matrix``."""
        return sp.csc_matrix(self.values)

    def __len__(self) -> int:
        """Return the length of the first axis of ``values``."""
        return len(self.values)

    def __contains__(self, other) -> bool:
        """Return whether ``other`` is found in ``values``."""
        return other in self.values

    def __getitem__(self, item):
        """Look up a column by name (``str``) or index into ``values``.

        A string is resolved through the labelled frame (``to_df()[item]``);
        anything else is passed straight to ``values[item]``.
        """
        if isinstance(item, str):
            return self.to_df()[item]
        return self.values[item]

    # ------------------------------------------------------------------ #
    # arithmetic (element-wise, labels preserved)
    # ------------------------------------------------------------------ #
    def __add__(self, other):
        return self._wrap(self.values + self._unwrap(other))

    def __sub__(self, other):
        return self._wrap(self.values - self._unwrap(other))

    def __mul__(self, other):
        return self._wrap(self.values * self._unwrap(other))

    def __matmul__(self, other):
        """Matrix-multiply ``values`` by ``other``.

        Row labels always come from this instance.  Column labels are chosen
        as follows:

        * if ``other`` has a ``columns`` attribute (e.g. another
          ``SparseDF``), those labels are used;
        * otherwise, if the product has as many columns as this instance,
          this instance's column labels are kept;
        * otherwise pandas' default labels are used, because the old labels
          no longer match the width of the product.
        """
        labels = getattr(other, "columns", None)
        product = np.asarray(self.values @ self._unwrap(other))
        keeps_width = product.ndim == 2 and product.shape[1] == len(self.columns)
        if labels is None and keeps_width:
            labels = self.columns
        return SparseDF(pd.DataFrame(product, index=self.index, columns=labels))

    def __pow__(self, other):
        return self._wrap(self.values ** self._unwrap(other))

    def __truediv__(self, other):
        return self._wrap(self.values / self._unwrap(other))

    def __floordiv__(self, other):
        return self._wrap(self.values // self._unwrap(other))

    def __mod__(self, other):
        return self._wrap(self.values % self._unwrap(other))

    def __pos__(self):
        return self._wrap(+self.values)

    def __neg__(self):
        return self._wrap(-self.values)

    # ------------------------------------------------------------------ #
    # comparisons (element-wise, return a SparseDF rather than a bool)
    # ------------------------------------------------------------------ #
    def __eq__(self, other):
        return self._wrap(self.values == self._unwrap(other))

    def __ne__(self, other):
        return self._wrap(self.values != self._unwrap(other))

    def __lt__(self, other):
        return self._wrap(self.values < self._unwrap(other))

    def __le__(self, other):
        return self._wrap(self.values <= self._unwrap(other))

    def __ge__(self, other):
        return self._wrap(self.values >= self._unwrap(other))

    def __gt__(self, other):
        return self._wrap(self.values > self._unwrap(other))

    def __cmp__(self, other):
        """Delegate to ``values.__cmp__`` (legacy three-way comparison hook).

        NOTE(review): Python 3 never calls ``__cmp__`` implicitly, and NumPy
        arrays are not expected to provide it there, so an explicit call
        presumably raises ``AttributeError`` -- confirm before relying on it.
        """
        return self._wrap(self.values.__cmp__(self._unwrap(other)))

    # ------------------------------------------------------------------ #
    # export
    # ------------------------------------------------------------------ #
    def to_df(self) -> pd.DataFrame:
        """Rebuild a ``pandas.DataFrame`` from the values and labels."""
        return pd.DataFrame(
            self.values,
            index=self.index,
            columns=self.columns,
        )

    def to_pickle(self, filename: str) -> None:
        """Pickle this instance to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    def save_npz(self, filename) -> None:
        """Save the CSC matrix form of the values with ``scipy.sparse.save_npz``.

        Only the matrix is written; use ``colidx2json`` for the labels.
        """
        sp.save_npz(filename, self())

    def to_csv(self, filename, index: bool = True) -> None:
        """Write the labelled frame to ``filename`` as CSV.

        Args:
            filename: Destination passed to ``DataFrame.to_csv``.
            index: Whether the row labels are written.
        """
        self.to_df().to_csv(filename, index=index)

    def colidx2json(self, filename) -> None:
        """Write the row and column labels to ``filename`` as JSON.

        The file holds one object with the keys ``"index"`` and ``"columns"``,
        each mapping to the list of labels.
        """
        # Build the payload first so a failure here does not leave an empty file.
        labels = dict(
            index=self.index.to_list(),
            columns=self.columns.to_list(),
        )
        with open(filename, "w") as f:
            json.dump(labels, f)

In [395]:
# Toy 10x10 frame of standard-normal draws with rows idx0..idx9 and
# columns col0..col9, used below to exercise SparseDF.
temp = pd.DataFrame(
    data=np.random.randn(10, 10),
    index=[f"idx{row}" for row in range(10)],
    columns=[f"col{col}" for col in range(10)],
)

In [396]:
temp.__sizeof__()

1410

In [397]:
hoge = SparseDF(temp)

In [398]:
# SparseDF does not define __sizeof__, so this is the inherited, shallow size
# of the instance object; the wrapped array and labels are not counted.
hoge.__sizeof__()

32

In [399]:
# Bind 22 to `moge`, then write the same value into every element but the
# last of a throwaway array; that array is never stored anywhere.
moge = 22
np.arange(10)[:-1] = moge

In [400]:
moge

22

In [401]:
# Scratch probe: ndarray has no `attrgetter` method, so this call raises the
# AttributeError recorded in the output below.
np.arange(4, 100).attrgetter()

AttributeError: 'numpy.ndarray' object has no attribute 'attrgetter'

In [402]:
np.inf in np.arange(10).reshape(2, 5)

False

In [403]:
# Attribute access and the call bind tighter than unary minus, so this negates
# the int returned by __sizeof__(); SparseDF.__neg__ is not invoked here.
-(hoge / 3).__sizeof__()

-32

In [404]:
np.arange(10) / 3

array([0.        , 0.33333333, 0.66666667, 1.        , 1.33333333,
       1.66666667, 2.        , 2.33333333, 2.66666667, 3.        ])

In [405]:
# Write the row and column labels of `hoge` to ./temp.json as one JSON object.
with open("./temp.json", "w") as f:
    d = {
        "index": hoge.index.to_list(),
        "columns": hoge.columns.to_list(),
    }
    f.write(json.dumps(d))

In [406]:
hoge.to_pickle("./temp.pkl")

In [407]:
hoge.save_npz("./temp.npz")

In [408]:
hoge.to_csv("./temp.csv")

In [409]:
# Writes the labels to ./temp.json, the same path the manual json.dump cell
# above opened in "w" mode, so that file is overwritten here.
hoge.colidx2json("./temp.json")

In [413]:
type(hoge.to_df().columns)

pandas.core.indexes.base.Index

In [412]:
pd.core.indexes.base.Index

pandas.core.indexes.base.Index

In [414]:
type(hoge())

scipy.sparse._csc.csc_matrix

In [415]:
sp._csc.csc_matrix

scipy.sparse._csc.csc_matrix

In [417]:
["a", "b"].index("a")

0

In [420]:
hoge.to_df().columns.to_list().index("col2")

2