# Sheet 3 - Sergio Amortegui, Juan David Forero, Navid Esteban Mejia

## Imports

In [2]:
import numpy as np
import pandas as pd
import numbers
import itertools

## Exercise 2
Use the credits.csv for all the tasks in this exercise (iris datasets
only for some unit tests). You might stumble over some difficulties and may want to check
the replace and fillna function of Pandas data frames to get rid of missing values. Do not
use the get_dummies, digitize, cut, crosstab (or any similar) function in this exercise.

### 1 

#### a)
Write a function binarizeCategoricalAttributeVector(column) that takes a
categorical attribute vector (do not assume a categorical Pandas type) and re-
turns an n ×m dimensional numpy array, where m is the number of categorical
values occurring in the column and n is the length of the original column.

In [None]:
def binarizeCategoricalAttributeVector(column):
    n = len(column)
    m = set(column)
    itemIndex = {}
    count = 0

    for item in m:
      itemIndex[item] = count
      count += 1

    arr = np.zeros((n,len(m)), dtype=int)
    for index, item in enumerate(column):
      arr[index][itemIndex[item]] = 1

    return arr


#### b)
Then write a function getCategoricalAttributes(df) that returns a list of column names of a Pandas DataFrame that contain non-numeric values.

In [None]:
def getCategoricalAttributes(df):
  return(df.select_dtypes(exclude=['int64','float64']).columns)

#### c)
Finally, write a function readFrameAsMatrix(df) to convert a given DataFrame
into a purely numeric nd array such that each categorical attribute with m values
is converted into m binary attributes (columns).

In [None]:
def readFrameAsMatrix(df):
  # use getCategoricalAttributes and binarize to complete de point
  return df.to_numpy()

#### Tests

In [None]:
def check_column_conversion(column):
    M = binarizeCategoricalAttributeVector(column)
    vals = list(np.unique(M))
    sorted(vals)
    print("-----------------\nBinarization check\n-----------------")
    print("Dimension check: " + ("OK" if M.shape == (len(column), len(np.unique(column))) else "FAIL"))
    print("Occurring values: " + ("OK" if vals == [0, 1] else "FAIL (there should only be 0s and 1s in the output.)"))
    print("Coherence: " + ("OK" if all(np.sum(M, axis=1) == np.ones(len(column))) else "FAIL (all rows must sum up to 1)"))

def check_category_detection(df, expectedcols):
    print("-----------------\nCheck of category detection\n-----------------")
    act = getCategoricalAttributes(df)
    missing = [c for c in expectedcols if not c in act]
    unexpected = [c for c in act if not c in expectedcols]
    print("Categorical attribute detection: " + ("OK" if len(missing) + len(unexpected) == 0 else "FAIL (undetected columns: " + str(missing) + ", wrongly detected columns: " + str(unexpected) + ")"))
    
def check_frame_conversion(df, num_expected_columns):
    print("-----------------\nConversion check for data frames\n-----------------")
    A = readFrameAsMatrix(df)
    print("Outer Type check: " + ("OK" if type(A) == np.ndarray else "FAIL (not a numpy array but " + str(type(A)) + ")"))
    print("Inner Type check: " + ("OK" if A.dtype in [float, np.float32, np.float64] else "FAIL (dtype of matrix should be something numeric like float and not " + str(A.dtype) + ")"))
    print("Dimensionality check: " + ("OK" if len(A) == len(df) and A.shape[1] == num_expected_columns else "FAIL (expected shape " + str(len(df)) + " x " + str(num_expected_columns) + ", but observed shape " + str(len(A)) + " x " + str(A.shape[1]) + ")"))


## unit test for conversion functions
dfCreditTest = pd.read_csv("credits.csv")
check_column_conversion(dfCreditTest.values[:,1])
check_category_detection(dfCreditTest, ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'class'])
check_frame_conversion(dfCreditTest, 110)

### 2

In [None]:
def test_discretization(column, thresholds, names, expected):
    conv = discretizeBasedOnThresholds(column, thresholds, names)
    print("Conversion test: " + ("OK" if len(conv) == len(expected) and all(conv == expected) else "FAIL (expected \"" + str(expected) +"\" but observed \"" + str(conv) + "\")"))
    
def test_equal_length_discretization(arr, k, expected):
    act = discretizeEqualLength(arr, k)
    print ("Equal Length Discretization: " + ("OK" if all(act == expected) else "FAIL"))
    
def test_equal_count_discretization(arr, k, expected):
    act = discretizeEqualFrequency(arr, k)
    print ("Equal Count Discretization: " + ("OK" if all(act == expected) else "FAIL"))

# reproduce results from the lecture
dfIrisTest = pd.read_csv("iris.csv")
test_discretization(dfIris.values[:,0], [5.2, 6.1, 7], ["very short", "short", "long", "very long"], ["very short", "very short", "very short", "very short", "very short", "short", "very short", "very short", "very short", "very short", "short", "very short", "very short", "very short", "short", "short", "short", "very short", "short", "very short", "short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "short", "very short", "short", "very short", "very short", "short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "short", "very short", "long", "long", "long", "short", "long", "short", "long", "very short", "long", "very short", "very short", "short", "short", "short", "short", "long", "short", "short", "long", "short", "short", "short", "long", "short", "long", "long", "long", "long", "short", "short", "short", "short", "short", "short", "short", "short", "long", "long", "short", "short", "short", "short", "short", "very short", "short", "short", "short", "long", "very short", "short", "long", "short", "very long", "long", "long", "very long", "very short", "very long", "long", "very long", "long", "long", "long", "short", "short", "long", "long", "very long", "very long", "short", "long", "short", "very long", "long", "long", "very long", "long", "short", "long", "very long", "very long", "very long", "long", "long", "short", "very long", "long", "long", "short", "long", "long", "long", "short", "long", "long", "long", "long", "long", "long", "short"])
test_discretization(dfIris.values[:,1], [2.8, 3.6], ["short", "medium", "long"], ["medium", "medium", "medium", "medium", "medium", "long", "medium", "medium", "medium", "medium", "long", "medium", "medium", "medium", "long", "long", "long", "medium", "long", "long", "medium", "long", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "long", "long", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "short", "medium", "medium", "long", "medium", "long", "medium", "long", "medium", "medium", "medium", "medium", "short", "short", "short", "medium", "short", "medium", "short", "short", "medium", "short", "medium", "medium", "medium", "medium", "short", "short", "short", "medium", "short", "short", "short", "medium", "medium", "short", "medium", "medium", "short", "short", "short", "short", "short", "medium", "medium", "medium", "short", "medium", "short", "short", "medium", "short", "short", "short", "medium", "medium", "medium", "short", "short", "medium", "short", "medium", "medium", "medium", "medium", "short", "medium", "short", "medium", "medium", "short", "medium", "short", "short", "medium", "medium", "long", "short", "short", "medium", "short", "short", "short", "medium", "medium", "short", "medium", "short", "medium", "short", "long", "short", "short", "short", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "short", "medium", "medium", "medium", "short", "medium", "medium", "medium"])
test_equal_length_discretization(dfIrisTest.values[:,0], 4, np.array(["c0", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c1", "c1", "c1", "c0", "c1", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c1", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c2", "c2", "c2", "c1", "c2", "c1", "c2", "c0", "c2", "c0", "c0", "c1", "c1", "c1", "c1", "c2", "c1", "c1", "c2", "c1", "c1", "c1", "c2", "c1", "c2", "c2", "c2", "c2", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c2", "c2", "c1", "c1", "c1", "c1", "c1", "c0", "c1", "c1", "c1", "c2", "c0", "c1", "c2", "c1", "c3", "c2", "c2", "c3", "c0", "c3", "c2", "c3", "c2", "c2", "c2", "c1", "c1", "c2", "c2", "c3", "c3", "c1", "c2", "c1", "c3", "c2", "c2", "c3", "c2", "c1", "c2", "c3", "c3", "c3", "c2", "c2", "c1", "c3", "c2", "c2", "c1", "c2", "c2", "c2", "c1", "c2", "c2", "c2", "c2", "c2", "c2", "c1"]))
test_equal_count_discretization(dfIrisTest.values[:,0], 4, np.array(["c0", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c1", "c1", "c1", "c0", "c1", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c1", "c0", "c0", "c1", "c1", "c1", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c3", "c2", "c3", "c1", "c3", "c1", "c2", "c0", "c3", "c1", "c0", "c2", "c2", "c2", "c1", "c3", "c1", "c1", "c2", "c1", "c2", "c2", "c2", "c2", "c2", "c3", "c3", "c3", "c2", "c1", "c1", "c1", "c1", "c2", "c1", "c2", "c3", "c2", "c1", "c1", "c1", "c2", "c1", "c0", "c1", "c1", "c1", "c2", "c0", "c1", "c2", "c1", "c3", "c2", "c3", "c3", "c0", "c3", "c3", "c3", "c3", "c2", "c3", "c1", "c1", "c2", "c3", "c3", "c3", "c2", "c3", "c1", "c3", "c2", "c3", "c3", "c2", "c2", "c2", "c3", "c3", "c3", "c2", "c2", "c2", "c3", "c2", "c2", "c2", "c3", "c3", "c3", "c1", "c3", "c3", "c3", "c2", "c3", "c2", "c2"]))

### 3
In this exercise we want to check the independence of categorical attributes.

#### a)
Write a function getContingencyTable(M) that receives a 2D numpy array with
two columns and computes a table containing the absolute observed frequenties of
the pairs of occuring values.

In [3]:
def getContingencyTable(M):
  M_transposed = M.T
  M_zipped = list(zip(M_transposed[0], M_transposed[1]))
  y_categories, x_categories = (set(M_transposed[0]), set(M_transposed[1]))
  products = list(itertools.product(y_categories, x_categories))
  categories = dict(zip(products, np.array([0] * len(products))))
  y_index = {item:index for index, item in enumerate(y_categories)} 
  x_index = {item:index for index, item in enumerate(x_categories)}
  result = np.zeros((len(y_categories), len(x_categories)), dtype='int')
  for item in M_zipped:
    categories[item] = categories[item] + 1
  for categorie in categories.items():
    result[y_index[categorie[0][0]]][x_index[categorie[0][1]]] = categorie[1]
  return result

#### b)
Write a function computeExpectedOccurrences(ct) that receives a contingency
table and computes a table containing, for each pair of values, the number of
occurences one would expect given independency of the attributes.

In [4]:
def computeExpectedOccurrences(ct):
    total = sum(np.sum(ct, axis=0))
    ct_list = ct.tolist()
    ct_list_transposed = np.transpose(ct_list).tolist()

    result = ct_list.copy()

    for i in range(len(ct_list)):
        row = np.sum(ct_list[i], dtype=int)
        for j in range(len(ct_list_transposed)):
            column = np.sum(ct_list_transposed[j], dtype=int)

            result[i][j] = ( row * column ) / total

    return np.array(result)

#### c)
Write a function computeChiSquare(M) that receives a 2D numpy array with two
discrete columns and computes the χ2score of the two attributes.

In [5]:
def computeChiSquare(M):
    O = getContingencyTable(M)
    E = computeExpectedOccurrences(O)
    X2 = 0
    
    for i in range(len(O)):
        for j in range(len(O[0])):
            o = O[i][j]
            e = E[i][j]
            X2 += ( (o - e ) ** 2 ) / e

    return X2

#### Tests

In [158]:
def test_contingency_table(col1, col2):
    print("-----------------\nCheck for Contingency Table\n-----------------")
    M = np.array([col1, col2]).T
    ct = getContingencyTable(M)
    s2 = np.sum(ct, axis=0)
    s1 = np.sum(ct, axis=1)
    print("Dimensionality: " + ("OK" if len(s1) == len(np.unique(col1)) and len(s2) == len(np.unique(col2)) else "FAIL (expected dimension " + str(len(np.unique(col1))) + " x " + str(len(np.unique(col2))) + " but observed " + str(len(s1)) + " x " + str(len(s2)) + ")"))
    print("Sum 1: " + ("OK" if sum(s1) == len(col1) else "FAIL"))
    print("Sum 2: " + ("OK" if sum(s2) == len(col2) else "FAIL"))
    
def test_expected_table(ct):
    print("-----------------\nCheck for Expected Count Table\n-----------------")
    cs1 = np.sum(ct, axis=1)
    cs2 = np.sum(ct, axis=0)
    et = computeExpectedOccurrences(ct)
    s2 = np.sum(et, axis=0)
    s1 = np.sum(et, axis=1)
    print("Dimensionality: " + ("OK" if et.shape == ct.shape else "FAIL (expected dimension " + str(ct.shape[0]) + " x " + str(ct.shape[1]) + " but observed " + str(et.shape[0]) + " x " + str(et.shape[1]) + ")"))
    print("Sum 1: " + ("OK" if len(cs1) == len(s1) and all(np.isclose(cs1, s1)) else "FAIL"))
    print("Sum 2: " + ("OK" if len(cs2) == len(s2) and all(np.isclose(cs2, s2)) else "FAIL"))
    
def test_chi2(M, expected):
    print("-----------------\nChi²-Test\n-----------------")
    chi2 = computeChiSquare(M)
    print ("Chi²: " + ("OK" if np.round(chi2, 2) == expected else "FAIL"))

    
# reproduce results from the lecture
dfIrisDisc = pd.read_csv("iris_discretized_projected.csv")
test_contingency_table(dfIrisDisc.values[:,0], dfIrisDisc.values[:,1])
test_expected_table(getContingencyTable(np.array([dfIrisDisc.values[:,0], dfIrisDisc.values[:,1]]).T))
test_chi2(np.array([dfIrisDisc.values[:,0], dfIrisDisc.values[:,1]]).T, 21.8)

#### d)
Write a function checkIndependence(df, c1, c2, alpha) that receives a Pandas
DataFrame and the names of two columns and that returns true iff the indepen-
dence hypothesis is sustained in a χ2test (considering the appropriate degree of
freedom) for a given confidence threshold α for the p-value.

In [47]:
# from scipy.stats import chi2
import scipy.stats as stats
data = pd.read_csv('iris.csv')

def checkIndepence(df, c1,c2, alpha):
    M = df[[c1,c2]].values
    # O = getContingencyTable(M)
    # E = computeExpectedOccurrences(O)
    X2 = computeChiSquare(M) 
    q = (len(M) - 1) * (len(M[0]) - 1)

    ppf = stats.chi2.ppf(alpha, q)
    print(ppf)
    print(q, X2)

    return True if X2 < ppf else False


checkIndepence(data, 'sepal_length', 'sepal_width', 0.7)

157.5469673873229
149 791.7225565439824


False

#### e)
Then check independence hypothesis for all pairs of categorical variables of the
credit dataset. Plot the χ2curve for every pair of categorical variables with the
respective (given) critical point (we assume α = 0.01).
Are there pairs of independent variables?