In [1]:
# Get descriptive data from a dataframe with one categorical independent variable and one dependent variable.
# Input: dataframe (df), dependent variable (depVar), and independent variable (indVar).
# Output: prints comma-separated lines (group, n, mean, standard deviation), one per unique value of depVar, then a 'total' line.

def descriptive1(df, depVar, indVar):
    """Print per-group and overall descriptive statistics in CSV-like form.

    The rows of ``df`` are grouped by each unique value found in the
    ``depVar`` column (in order of first appearance), and the count, mean and
    sample standard deviation of the ``indVar`` column are printed for every
    group, followed by a ``total`` line covering the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    depVar : column label
        Column whose unique values define the groups.
    indVar : column label
        Numeric column that is summarised.

    Returns
    -------
    None
        Results are written to standard output, one line per group, with
        means and standard deviations rounded to two decimals.
    """
    variables = df[depVar].unique()
    for variable in variables:
        # Select from the frame that was passed in; the previous version
        # built the mask from an unrelated global, so it either raised
        # NameError or silently filtered with another dataset.
        variable_values = df.loc[df[depVar] == variable, indVar]
        print(variable, ',', len(variable_values), ",", round(variable_values.mean(), 2), ",", round(variable_values.std(), 2))
    print('total', ',', len(df), ',', round(df[indVar].mean(), 2), ',', round(df[indVar].std(), 2))

In [2]:
# Function to prune variables. Finds collinear variable pairs with |r| > limit (e.g. 0.7).
# Of each pair, removes the variable least strongly correlated with the dependent variable.

# Input: dataframe, dependent variable (depVar), and collinearity cutoff (limit).
# Returns: set of variable (column) names to prune from the dataframe.

def pruner(dataframe, depVar, limit):
    """Pick which variable to discard from every collinear pair.

    A correlation table is built with ``dataframe.corr()``. For every pair of
    variables other than ``depVar`` whose absolute correlation with each other
    exceeds ``limit``, the member with the smaller absolute correlation with
    ``depVar`` is marked for removal.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing ``depVar`` and the candidate variables.
    depVar : column label
        The dependent variable; it is never marked for removal.
    limit : float
        Collinearity cutoff; a pair counts as collinear when ``|r| > limit``.

    Returns
    -------
    set
        Labels of the variables to drop. Being a set, it has no defined order.

    Notes
    -----
    Each unordered pair is examined exactly once. When both members are
    equally strongly correlated with ``depVar`` (for example, duplicate
    columns), only the one appearing later in the column order is dropped.
    The earlier implementation visited every pair in both directions and
    therefore discarded both members of such a tie.
    """
    # Makes a correlation table
    cor_table = dataframe.corr()
    # Absolute correlation of every variable with the dependent variable
    strength = cor_table.loc[depVar].abs()
    candidates = [column for column in cor_table.columns if column != depVar]

    rows_to_drop = set()
    for position, var in enumerate(candidates):
        # Only look at later columns so that a pair is never judged twice
        for row in candidates[position + 1:]:
            # NaN correlations compare False here and are skipped
            if abs(cor_table.loc[row, var]) > limit:
                if strength.loc[var] < strength.loc[row]:
                    rows_to_drop.add(var)
                else:
                    # row is weaker, or the two are tied: keep the earlier column
                    rows_to_drop.add(row)
    return rows_to_drop

In [8]:
### CHECKING PRUNER()
# This generates a table of random values with 10 rows and 11 columns.
# A is the dependent variable. Matched columns are a,b; c,d; e,f
# One cell in each of columns a, c, and e is changed to match the dependent variable.
# In general, columns b, d, and f should drop, but there can be random effects.
# If you get an unexpected result, consult the correlation table.

import random
import pandas as pd

columns = ['A', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Ten uniform random draws for every column, generated in column order.
random_dict = {name: [random.random() for _ in range(10)] for name in columns}

# Make the second member of each matched pair a copy of the first.
for source, twin in (('a', 'b'), ('c', 'd'), ('e', 'f')):
    random_dict[twin] = random_dict[source]

frame = pd.DataFrame.from_dict(random_dict)

# Overwrite the first row of a, c and e with A's first value so that these
# columns differ from their twins in exactly one cell.
for nudged in ('a', 'c', 'e'):
    frame.loc[0, nudged] = frame.loc[0, 'A']

frame

Unnamed: 0,A,a,b,c,d,e,f,g,h,i,j
0,0.326835,0.326835,0.344813,0.326835,0.716231,0.326835,0.306313,0.377553,0.455286,0.5913,0.299228
1,0.700175,0.564341,0.564341,0.525018,0.525018,0.838092,0.838092,0.281749,0.316372,0.607738,0.909108
2,0.389385,0.786717,0.786717,0.892399,0.892399,0.499348,0.499348,0.695887,0.443246,0.329292,0.89329
3,0.943782,0.101872,0.101872,0.534048,0.534048,0.436036,0.436036,0.572265,0.382849,0.121225,0.865465
4,0.588412,0.414309,0.414309,0.287307,0.287307,0.985786,0.985786,0.632218,0.209396,0.475304,0.89066
5,0.678557,0.187132,0.187132,0.579006,0.579006,0.772842,0.772842,0.932008,0.754856,0.798452,0.441029
6,0.087797,0.581872,0.581872,0.048733,0.048733,0.795976,0.795976,0.121578,0.012966,0.253237,0.017728
7,0.053295,0.088938,0.088938,0.618858,0.618858,0.993716,0.993716,0.406456,0.593853,0.908574,0.538008
8,0.377199,0.072346,0.072346,0.786476,0.786476,0.350429,0.350429,0.727996,0.717291,0.753998,0.772613
9,0.669254,0.022669,0.022669,0.368453,0.368453,0.752087,0.752087,0.851168,0.292091,0.050303,0.287765


In [9]:
# Pairwise correlation table of the test frame. pruner() builds this same
# table internally via dataframe.corr(), so it can be used to check by hand
# which pairs exceed the limit and which member is the weaker correlate of A.
frame.corr()

Unnamed: 0,A,a,b,c,d,e,f,g,h,i,j
A,1.0,-0.210299,-0.214299,0.120952,0.026106,-0.136683,-0.130091,0.478813,0.045777,-0.375776,0.474325
a,-0.210299,1.0,0.999767,-0.046897,-0.038713,0.066192,0.064943,-0.42236,-0.466924,-0.14513,0.1484
b,-0.214299,0.999767,1.0,-0.052063,-0.03319,0.055649,0.053983,-0.427464,-0.465409,-0.1424,0.141385
c,0.120952,-0.046897,-0.052063,1.0,0.877361,-0.339788,-0.329366,0.487993,0.737225,0.346696,0.636861
d,0.026106,-0.038713,-0.03319,0.877361,1.0,-0.579016,-0.578185,0.363686,0.762916,0.405964,0.475852
e,-0.136683,0.066192,0.055649,-0.339788,-0.579016,1.0,0.999752,-0.143285,-0.303667,0.167149,-0.076368
f,-0.130091,0.064943,0.053983,-0.329366,-0.578185,0.999752,1.0,-0.135162,-0.301294,0.161915,-0.067228
g,0.478813,-0.42236,-0.427464,0.487993,0.363686,-0.143285,-0.135162,1.0,0.561499,-0.011801,0.237274
h,0.045777,-0.466924,-0.465409,0.737225,0.762916,-0.303667,-0.301294,0.561499,1.0,0.680958,0.238529
i,-0.375776,-0.14513,-0.1424,0.346696,0.405964,0.167149,0.161915,-0.011801,0.680958,1.0,0.120957


In [10]:
# Should (usually) drop columns b, d, and f, i.e. the twin of each matched pair
# that was not changed to match A.
# With only 10 random rows, the other member of a pair may turn out weaker instead,
# and unrelated columns can exceed the 0.7 limit by chance and be dropped as well.
# pruner() returns a set, so the printed order of the names is arbitrary.
# When you get something unexpected, consult the correlation table to see that it checks out.

print(pruner(frame, 'A', 0.7))

{'h', 'f', 'd', 'a'}


In [None]:
def ZeroRemover(df, max_zero_fraction=0.2):
    """Return a copy of ``df`` without the columns that are mostly zeros.

    A column is removed when the number of cells equal to 0 is strictly
    greater than ``len(df) * max_zero_fraction``. The number of removed
    columns is printed as a quick sanity check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    max_zero_fraction : float, optional
        Largest share of zero-valued rows a column may have and still be
        kept. Defaults to 0.2, the cutoff that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the retained columns.
    """
    # A column sitting exactly on the cutoff is kept (strict comparison).
    max_zero_count = len(df) * max_zero_fraction
    columns_to_drop = [
        column for column in df.columns
        if (df[column] == 0).sum() > max_zero_count
    ]
    print(len(columns_to_drop))
    return df.drop(columns_to_drop, axis='columns')

In [None]:
from scipy.stats import chisquare

