In [92]:
import pandas as pd
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # This comment is from MSCA

%reload_ext autoreload
%autoreload 1

pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

display(HTML("<style>.container { width:100% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

import os, glob
import pandas as pd
import numpy as np

## Lists

In [95]:
a = list(range(0,10))
print(a)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [96]:
b = a + 2

TypeError: can only concatenate list (not "int") to list

In [13]:
b = a * 2
print(b)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [27]:
b = [x+2 for x in a]

# Put print loop in function
print("a --> b")
for x, y in zip(a, b):
    print(x, "-->", y)

a --> b
0 --> 2
1 --> 3
2 --> 4
3 --> 5
4 --> 6
5 --> 7
6 --> 8
7 --> 9
8 --> 10
9 --> 11


##### mapping a lambda function - generally slower

In [28]:
b = list(map(lambda x: x+2, a))
print(b)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


## Generator expressions

Similar to list comprehensions, but they return "iterables" as opposed to the full list. 

This saves memory because the new values are not created in memory, rather in run time as and when required. 

In [86]:
b = [x+2 for x in a]
print(b)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [88]:
b_expr = (x+3 for x in a)
print(b_expr)

<generator object <genexpr> at 0x000002B179493B48>


In [89]:
b.extend(b_expr)
b

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

## Dictionaries

Make a dictionary of key-value pairs such that the "Key" is all the even numbers between 10-20 and the "value" is the cube of the corresponding key

In [37]:
d = dict({})

for i in range(0,10,2):
    d[i] = i**3
print(d)

{0: 0, 2: 8, 4: 64, 6: 216, 8: 512}


In [45]:
d = {x:x**3 for x in range(0,10,2)}
print(d)

{0: 0, 2: 8, 4: 64, 6: 216, 8: 512}


In [46]:
d = {x:x**3 for x in range(0,10,1) if x%2 == 0}
print(d)

{0: 0, 2: 8, 4: 64, 6: 216, 8: 512}


#### Nested dictionary comprehension

In [91]:
# nested_dict = {'first':{'a':1}, 'second':{'b':2}}
# float_dict = {outer_k: {float(inner_v) for (inner_k, inner_v) in outer_v.items()} for (outer_k, outer_v) in nested_dict.items()}
# print(float_dict)

{'first': {1.0}, 'second': {2.0}}


## Recursions

### Fibonacci

 Fn = Fn-1 + Fn-2

In [90]:
def fibonacci(num): 
    if num < 0: 
        print("Cannot be less than zero") 
        return -1
    elif num==1: 
        # First Fibonacci number is 0 
        return 0
    elif num==2: 
        # Second Fibonacci number is 1 
        return 1
    else: 
        return fibonacci(num-1) + fibonacci(num-2) 
    
print(fibonacci(5))

3


---------

# Functions, classes and other objects

In [98]:
import pandas as pd

In [101]:
class FileUtils:    
    
    def __init__(self):
        print("Constructor for the class")
        
        
    def read_csv(self, filepath, 
                 index_colname=None, 
                 nrows_to_read=None, skip_reading_rows=None):
        try:
            df = pd.read_csv(filepath_or_buffer=filepath, 
                             index_col=index_colname, 
                             nrows=nrows_to_read, 
                             skiprows=skip_reading_rows)
        except Exception as e:
            print("Exception thrown while reading CSV file/buffer :", e)
            return pd.DataFrame({})
        return df
        
        
    def write_csv(self, filepath, 
                  df, write_index_col=True):
        try:
            df.to_csv(filepath, index=write_index_col)
        except Exception as e:
            print(" Exception thrown while writing CSV file/buffer :", e)
            return -1
        return 1
    
    
    def save_obj_to_disk(self, obj, filename):
        try:
            with open(filename, 'wb') as handle:
                pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print(e)


    def load_obj_from_disk(self, filename):
        try:
            with open(filename, 'rb') as handle:
                b = pickle.load(handle)        
                return b
        except Exception as e:
            print(e)    

        

In [None]:
class DataFrameUtils:
    
    def __init__(self):
        self._labelencoders = None
        self._onehotencoders = None
        self._scalers = None
        pass
    
    # ----------------------------------------------------------------------------------------------------------------------
    # ENCODING UTILS
    # ----------------------------------------------------------------------------------------------------------------------

    def get_label_encoded(df, colname, inplace=True):
        """
        Returns label encoded column appended to the data frame.
        New column is pre-pended with "le_" followed by @colname
        :param df: data frame
        :param colname: name of the column to encode
        :param inplace: if True, replaces the original columns instead of making new ones
        :return: updated dataframe
        """

        # Sanity check
        if colname not in df.columns:
            raise ValueError("Column not in Dataframe!")
            return df

        le = LabelEncoder()
        le.fit(df[colname])
        le_colname = colname
        if not inplace:
            le_colname = "le_" + le_colname
        df[le_colname] = le.transform(df[colname])
        return df, le


    def labelencode_collist(df, collist, inplace=True):
        """
        Returns label encoded columns appended to the data frame.
        New columns are pre-pended with "le_" followed by @colname
        :param df: data frame
        :param collist: list with names of the columns to encode
        :param inplace: if True, replaces the original columns instead of making new ones
        :return: updated dataframe and dict of colname:encoder
        """

        self._labelencoders= {}

        for col in collist:
            if col not in df.columns:
                continue
            df, le = get_label_encoded(df, col, inplace)
    #         encoder_list[col] = le
            self._labelencoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))

        return df


    def get_onehot_encoded(df, colname, drop_original=True):
        """
        Returns One Hot Encoded columns appended to the data frame.
        New columns are pre-pended with @colname followed by encoded class label
        :param df: data frame
        :param colname: name of the column to encode
        :param drop_original: if True, drops original column
        :return: updated dataframe 
        """

        # Sanity check
        if colname not in df.columns:
            raise ValueError("Column not in Dataframe!")
            return data

        ohe = OneHotEncoder(categorical_features=[0], handle_unknown="ignore")
        out = ohe.fit_transform(df[colname].values.reshape(-1,1)).toarray()
        # Drop the first column - dummy variable trap
        out = out[:,1:]
        # Join with the original data frame
        dfOneHot = pd.DataFrame(out, 
                                columns=[colname+"_"+str(int(i)) for i in range(out.shape[1])], 
                                index=df.index)
        df = pd.concat([df, dfOneHot], axis=1)

        if drop_original:
            df.drop(colname, axis=1, inplace=True)

        return df, ohe


    def onehotencode_collist(df, collist, drop_original=True):
        """
        Returns One Hot Encoded columns appended to the data frame.
        New columns are pre-pended with @colname followed by encoded class label
        :param df: data frame
        :param collist: list with names of the columns to encode
        :param drop_original: if True, drops original column
        :return: updated dataframe and dict of colname:encoder
        """

        self._onehotencoders= {}

        for col in collist:
            if col not in df.columns:
                continue
            print(col)
            df, ohe = get_onehot_encoded(df, col, drop_original)
            self._onehotencoders[col] = ohe

        return df



    # ----------------------------------------------------------------------------------------------------------------------
    # SCALING UTILS
    # ----------------------------------------------------------------------------------------------------------------------

    def scale_collist(df, collist):
        """
        Returns One Hot Encoded columns appended to the data frame.
        New columns are pre-pended with @colname followed by encoded class label
        :param df: data frame
        :param collist: list with names of the columns to encode
        :param drop_original: if True, drops original column
        :return: updated dataframe and dict of colname:encoder
        """

        self._scalers = {}

        for col in collist:
            if col not in df.columns:
                continue

            scaler = StandardScaler()
            df[col] = scaler.fit_transform(df[col].values.reshape(-1,1))
            self._scalers[col] = scaler

        return df, self._scalers
    