In [2]:
import pandas as pd
import numpy as np
from pandas import read_csv
import csv
import os
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, SVC
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.spatial.distance import cdist
from sklearn.preprocessing import Imputer
from fancyimpute import SoftImpute
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import re
from scipy import stats
import glmnet_python
from glmnet import glmnet; from glmnetPlot import glmnetPlot 
from glmnetPrint import glmnetPrint; from glmnetCoef import glmnetCoef; from glmnetPredict import glmnetPredict
from cvglmnet import cvglmnet; from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot; from cvglmnetPredict import cvglmnetPredict
import math

Using TensorFlow backend.
  return f(*args, **kwds)
  from pandas.core import datetools


# Main Flow

In [3]:
#Generate cross correlation file
filename = '../data/uganda_2013_cleaned.csv'
imp_df = read_csv(filename)
imp_cols = imp_df.columns.values
corr = imp_df.corr('spearman')
corr = imp_df.corr('spearman')['crop_sales___output']
corr.to_csv('../data/uganda_corr.csv',  header=['crop_sales___output'])

In [4]:
imp_df = read_csv(filename)
cdf = imp_df.loc[imp_df['crop_sales___output'].dropna().index] #___tanzania_2014
# Print missingness values for relevant inputs.
missing = 100-(cdf.apply(lambda x: x.count(), axis=0)/len(cdf)*100.0)
print (missing[missing > 1])

#Plot variations of input features w.r.t output to pick variables to cluster on.
varies = imp_df.groupby(pd.qcut(imp_df['crop_sales___output'],5,duplicates='drop')).mean()
plt.clf()
varies.plot(x='crop_sales___output', subplots=True,legend=True, figsize=(50,200),kind='bar',fontsize=20)
plt.savefig('../figures/uganda_variations.pdf')

children_education___output              16.429354
expenditure___output                     59.802848
has_hired_workers___policy                3.559693
number_of_animals_owned___policy          3.614458
number_of_days_hired_workers___policy     3.559693
number_of_hired_workers___policy          3.559693
owns_land_certificate___policy           10.240964
uses_irrigation___policy                 23.767798
land_surface                              1.204819
dtype: float64


In [5]:
# Reset matplotlib defaults if plots are skewed after generating variations.pdf.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams.update(mpl.rcParamsDefault)

In [8]:
imp_df = read_csv('../data/uganda_2013_cleaned.csv')
imp_cols = imp_df.columns.values

def select_all_year_feats():
    imp_feats = set([])
    for y in [2013]:
        y_reg = re.compile('.*___policy.*' + str(y) + '$')
        y_cols = set(filter(y_reg.search, imp_cols))
        y_cols = set([t.replace('___uganda_' + str(y), '') for t in y_cols])
        if len(imp_feats) == 0:
            imp_feats = y_cols
        else:
            imp_feats = imp_feats.intersection(y_cols)
    return imp_feats

def select_corr_feats(thres = 0.1, only_policy=False):
    ccs = pd.read_csv('../data/uganda_corr.csv')
    output = 'crop_sales___output'
    ccs[output] = ccs[output].apply(lambda x: abs(x))
    ccs = ccs[ccs[output] > thres]
    imp_feats = ccs['Unnamed: 0']
    non_raw_reg = re.compile('^((?!lives_in|longitude|latitude|output).)*$')
    imp_feats = list(filter(non_raw_reg.search, imp_feats))
    if only_policy:
        y_reg = re.compile('.*___policy.*$')
        imp_feats = list(filter(y_reg.search, imp_feats))
    return imp_feats

def normalize():
    raw_df = read_csv('../data/uganda_2013_cleaned.csv')
    output = 'crop_sales___output'
    raw_df = raw_df.loc[raw_df[output].dropna().index]
    raw_imp = Imputer(strategy="mean")
    raw_completed = raw_imp.fit_transform(raw_df)
    raw_cols = raw_df.columns.values
    raw_df = pd.DataFrame(raw_completed,columns=raw_cols)
    return raw_df

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif_score

def spearman():
    collinear = raw_df[feats].corr('spearman')
    cols = collinear.columns
    for c in collinear.columns:
        for v in zip(cols, collinear[c]):
            if v[1] > 0.4 and v[1] < 0.5 and v[1] != 1:
                print (c, v[0], v[1])
    
def vif(feats, raw_df, thres=5, debug=False):    
    while True:
        policy = raw_df[feats].as_matrix()
        max_vif = 0
        max_vif_feat = None
        for i, f in enumerate(feats):
            if max_vif < vif_score(policy, i):
                max_vif = vif_score(policy, i)
                max_vif_feat = f
        if max_vif < thres:
            break
        feat_set = set(feats)
        feat_set.remove(max_vif_feat)
        if debug:
            print ('Removed: {0} : {1}'.format(max_vif_feat, max_vif))
        feats = list(feat_set)
    if debug:
        print ('\nFinal chosen features \n')
        for i, f in enumerate(feats):
            print (f, vif_score(policy, i))
    
    return feats

raw_df = normalize()
vifs = []

def grid_search():
    c_thresholds = [0.05, 0.07, 0.1, 0.12, 0.14,  0.15, 0.16, 0.17, 0.2,0.25,0.3]
    v_thresholds = [2,3,4,5]
    for thres in thresholds:
        imp_feats = select_corr_feats(0.1)
        v = vif(imp_feats, raw_df, thres)
        vifs += [len(v)]

c = 0.05
v = 1.5
imp_feats = select_corr_feats(c)
all_relevant = vif(imp_feats, raw_df, v, True)

imp_feats = select_corr_feats(c, True)
policy_relevant = vif(imp_feats, raw_df, v, True)

Removed: attended_school : 18.69898270071554
Removed: number_of_animals_owned___policy : 17.664015857164394
Removed: number_of_tools_owned___policy : 6.4455528947018275
Removed: household_size : 5.22403056304593
Removed: crop_diversification___policy : 4.71536467411384
Removed: literacy : 4.064549666228747
Removed: number_of_hired_workers___policy : 3.91340156406197
Removed: has_hired_workers___policy : 2.2621806823312527
Removed: household_head_is_male : 2.055081254069203
Removed: land_surface : 1.5906669251463286

Final chosen features 

quantity_of_improved_seeds___policy 1.02027042423
owns_land_certificate___policy 1.11905326438
number_of_days_hired_workers___policy 1.15338734157
number_of_ploughs_owned___policy 1.12084109816
household_head_is_widowed 1.07705207062
number_of_cows_owned___policy 1.05247198906
has_borrowed___policy 1.12834866273
quantity_of_fertilizers_used___policy 1.04659267965
quantity_of_pesticides_used___policy 1.00798227542
Removed: number_of_animals_owned___po

In [10]:
country = 'uganda'

def get_classes(output_var, pred):
    max_bins = 3
    _, boundaries = np.histogram(output_var, bins=max_bins)
    classes = np.digitize(pred, bins=boundaries)
    return classes, max_bins

def for_year(var, year):
    return var + '___' + country + '_' + str(year)

def run_regressions(fixed_k, in_name, out_dir, non_policy_inputs, segment_variables, inputs, output, year):
    global table
    global coef_table
    global avg_table
    global coef_map
    # table for regressions and classification
    table = pd.DataFrame()
    avg_table = pd.DataFrame()
    coef_map = {}
    # create table of coefficients
    coef_table = pd.DataFrame()
    
    try:
        os.mkdir(out_dir)
    except:
        print("Dir exists")
    
    df = read_csv(in_name)
    df = df.loc[df[output].dropna().index] # drop rows with unobserved income
    df = df.loc[df['weight'].dropna().index]
    df = df.loc[df[output] != 0]
    # Transform input
    logged_inputs = ['crop_sales___output', 'expenditure___output', 'crop_diversification___policy', 'number_of_animals_owned___policy', 'number_of_hired_workers___policy', 'quantity_of_fertilizers_used___policy', 'quantity_of_pesticides_used___policy', 'household_size', 'land_surface']
    for inp in logged_inputs:
        df[inp] = df[inp].apply(lambda x: np.log(1+x))

    df['nid']= df.index.tolist()

    # select % of data in test set
    test_split = 0.2
    
    imp = Imputer(strategy="mean")
    completed = imp.fit_transform(df)
    
    # reconstruct dataframe with completed matrix
    cols = df.columns.values
    mat = pd.DataFrame(completed,columns=cols)
    
    # Redo the same, but without the transformations, only matrix completion for raw matrix.
    raw_df = read_csv(in_name)
    raw_df = raw_df.loc[raw_df[output].dropna().index]
    raw_df = raw_df.loc[raw_df['weight'].dropna().index]
    raw_df = raw_df.loc[raw_df[output] != 0]
    raw_df['nid']= raw_df.index.tolist()
    raw_imp = Imputer(strategy="mean")
    raw_completed = raw_imp.fit_transform(raw_df)
    raw_cols = raw_df.columns.values
    raw_df = pd.DataFrame(raw_completed,columns=raw_cols)
    raw_df['productivity'] = raw_df['crop_sales___output']/raw_df['land_surface']
    raw_df['productivity'] = raw_df['productivity'].apply(lambda x: 0 if x == np.inf else x)
    
    # z-score the matrix mat used for clustering/regression.
    mat_scaled = StandardScaler()
    mat_scaled.fit(mat)
    mat_sc = mat_scaled.transform(mat)
    mat = pd.DataFrame(mat_sc, columns=mat.columns)
    
    y = mat[output]
    x = mat[inputs]
    
    def update_best_lambda(x_scaled):
        copy_y = np.array(y, dtype=np.float64)
        print (copy_y)
        fit = cvglmnet(x = x_scaled.copy(), y = copy_y)
        print(fit['lambda_min'])
        return fit['lambda_min']
    
    # Split test/train
    indices = range(len(mat))
    x_train, x_test, y_train, y_test, ind_train, ind_test = \
        train_test_split(x, y, indices, test_size=test_split, random_state=42)
    
    def get_train_test(input_vars):
        x = mat[input_vars].copy()
        x_scaled = StandardScaler()
        x_scaled.fit(x)
        x_sc = x_scaled.transform(x)
        # reconstruct DataFrame
        x = pd.DataFrame(x_sc, columns=x.columns)
        training_x = x.iloc[ind_train, :]
        testing_x = x.iloc[ind_test, :]
        return x_sc, training_x, testing_x
    
    def digitize(output_var, pred):
        from sklearn.preprocessing import label_binarize
        classes, max_bins = get_classes(output_var, pred)
        b_classes = label_binarize(classes, range(max_bins))
        return b_classes
    
    def calc_unsegmented(baseline, x_train, x_test): 
        global table
        global coef_map
        # reg keeps predictions from regressions along with keys
        reg = dict()
        name = 'OLS'
        reg[name] = {}

        # keys and values from test data
        keys_list = []
        y_list = []
        for k, v in y_test.iteritems():
            keys_list.append(k)
            y_list.append(v)

        # run regressions on full dataset
        name = 'OLS'
        model = sm.OLS(y_train, x_train)
        fit = model.fit_regularized(alpha=1e-5, refit=True)
        y_pred = fit.predict(x_test)

        # add predictions to dict
        for i, p in enumerate(y_pred):
            t = reg[name]
            t[keys_list[i]] = p

        try:
            test_c = digitize(y, y_test)
            pred_c = digitize(y, y_pred)
            auc_c = roc_auc_score(test_c, pred_c, average='macro')
        except ValueError:
            auc_c = 0.5
        mse = mean_squared_error(y_test,y_pred)
        scaled_mse = (mse/np.std(y))

        # add row to table
        new_row = pd.DataFrame({'model': name, 'segment': '', 'input': baseline, 'scaled_mse': scaled_mse, 'mse': mse, 'roc_auc': auc_c, 'clustered': False}, index=[0])
        table = table.append(new_row, ignore_index=True)

        # add coefficients to map
        coef_map[name + '_' + baseline] = fit.params
        lower_bounds = []
        upper_bounds = []
        for ci in fit.conf_int():
            lower_bounds += [ci[0]]
            upper_bounds += [ci[1]]
        coef_map[name + '_' + baseline + '_lower_bound'] = lower_bounds
        coef_map[name + '_' + baseline + '_upper_bound'] = upper_bounds
    
    def calc_segmented(segment_variables, baseline, x_train, x_test):
        global table
        global coef_map
        global avg_table
        segment_vars = list(segment_variables.keys())
        
        def add_clusters():
            # elbow method
            sse = []
            seg_data = mat[segment_vars]
            for seg_var in segment_vars:
                seg_data[seg_var] = seg_data[seg_var].apply(lambda x: x*segment_variables[seg_var])
            for k in range(1,9):
                kmeans = KMeans(n_clusters=k).fit(seg_data)
                labels = kmeans.labels_
                sse.append(sum(np.min(cdist(seg_data, kmeans.cluster_centers_, 'euclidean'), axis=1)) / seg_data.shape[0])

            # K-means elbow calculation
            plt.clf()
            plt.plot(range(1,9), sse)
            plt.xlabel('k')
            plt.ylabel('Sum of squared error')
            plt.savefig(os.path.join(out_dir, 'elbow.png'))
            print(sse)
            min_k = sse.index(min(sse))
            print (min_k)
            
            # K-means fixed K calculation.
            min_k = fixed_k
            kmeans = KMeans(n_clusters=min_k).fit(seg_data)

            labels = kmeans.labels_
            mat['cluster'] = labels
            # Sort cluster labels in order of mean of output within cluster.
            means = []
            for i in np.unique(labels):
                sel_mat = mat[mat['cluster'] == i]
                raw_clus = raw_df.loc[sel_mat.index]
                values = list(raw_clus[output].as_matrix())
                # Weighted average can be done if the weight column exists.
                weights = list(raw_clus['weight'].as_matrix())
                average = np.average(values, weights=weights)
                means.append(average)
            sorted_ids = [i[0] for i in sorted(enumerate(means), key=lambda x:x[1])]
            print(sorted_ids)
            mat['cluster'] = mat['cluster'].apply(lambda x: sorted_ids.index(x))

            # Output ids of households, their cluster number, variables on which cluster is done along with lat/long if exists
            # select_variables += ['latitude___ethiopia_2015', 'longitude___ethiopia_2015']
            # Note (Sam): This is where the file I give you with Ids, cluster numbers is written.
            # The baseline = relevant variables
            select_variables = [output, 'y4_hhid'] + segment_vars
            select_variables = [t.replace('norm', 'raw') for t in select_variables]
            all_output = pd.concat([mat['cluster'], raw_df[select_variables]], 1)
            all_output.to_csv(os.path.join(out_dir,'clus_' + baseline + '_' + output + '.csv'))
            return min_k
            
        
        # Add segments based on median.
        def add_segments():
            median_segments = {}
            for seg_var in segment_vars:
                #binary
                if len(np.unique(mat[seg_var])) <= 3:
                    median_segments[seg_var] = 0
                else:
                    median_segments[seg_var] = np.median(mat[seg_var])
            mat['cluster'] = 0
            for seg_var in segment_vars:
                mat['cluster'] = 2*mat['cluster'] + [int(x) for x in mat[seg_var] > median_segments[seg_var]]
            #mat[['cluster'] + segment_vars].to_csv(os.path.join(out_dir,'segment_' + ','.join(segment_vars) + '_' + baseline + '_' + output + '.csv'))
            return int(math.pow(2, len(segment_vars)))

        # Segments based solely on location.
        def add_location_segments():
            locations = ['afar', 'amhara', 'benishangul_gumuz', 'dire_dawa', 'gambella', 'harari', 'oromiya', 'snnp', 'somalie', 'tigray']
            i = 0
            mat['cluster'] = 0
            for l in sorted(locations):
                loc_feature = 'lives_in_' + l + '___ethiopia_' + str(year)
                loc_val = mat[loc_feature].apply(lambda x: 0 if x < 0 else 1)
                mat['cluster'] = mat['cluster'] + (i*loc_val)
                i += 1
            return len(locations) + 1
            
        def _run(max_clusters, method_name):
            global table
            global avg_table
            global coef_map
            # reg_clus keeps predictions from clustered regressions along with keys
            reg_clus = dict()

            name = 'OLS'
            reg_clus[name] = {}

            # need new dataframes with only training and test rows.
            # we use this when looping through clusters
            train_mat = mat.loc[ind_train]
            test_mat = mat.loc[ind_test]
            train_size = len(train_mat)
            
            series = {}
            series[output] = []
            for seg in segment_vars:
                series[seg] = []
            series = pd.DataFrame()
            row = {}
            raw_cols = raw_df.columns.values
            raw_reg = re.compile('^((?!norm).)*$') #+ str(year) +
            # avg_variables is set of all variables whose mean, 25%ile, 75%ile, stddev, stderr stats are written to *_avg file.
            avg_variables = list(filter(raw_reg.search, raw_cols))
            for i in range(max_clusters):
                train_clus = x_train.loc[train_mat['cluster'] == i]
                train_y = y_train.loc[train_mat['cluster'] == i]
                test_clus = x_test.loc[test_mat['cluster'] == i]
                test_y = y_test.loc[test_mat['cluster'] == i]
                sel_mat = mat[mat['cluster'] == i]
                raw_clus = raw_df.loc[sel_mat.index]

                for seg in avg_variables:
                    values = raw_clus[seg].as_matrix()
                    # Weighted average if weight column exists.
                    weights = raw_clus['weight'].as_matrix()
                    average = np.average(values, weights=weights)
                    row['mean_' + seg] = average
                    variance = np.average((values-average)**2, weights=weights)
                    row['stddev_' + seg] = math.sqrt(variance)
                    row['stderr_' + seg] = math.sqrt(variance)/math.sqrt(len(values))
                    row['25ile_' + seg] = np.percentile(values, 25)
                    row['75ile_' + seg] = np.percentile(values, 75)
                
                row['index'] = i
                row['size'] = len(raw_clus)
                new_row = pd.DataFrame(row, index=[0])
                series = series.append(new_row, ignore_index=True)
                avg_table = avg_table.append(new_row, ignore_index=True)
                cluster_percent = (len(train_clus)*100.0)/train_size
                if train_clus.empty or test_clus.empty:
                    continue

                keys_list = []
                y_list = []
                for k, v in test_y.iteritems():
                    keys_list.append(k)
                    y_list.append(v)

                # Regress per cluster
                name = 'OLS'
                model = sm.OLS(train_y, train_clus)
                fit = model.fit_regularized(alpha=1e-5, refit=True)
                y_pred = fit.predict(test_clus)

                for a, b in enumerate(y_pred):
                    t = reg_clus[name]
                    t[keys_list[a]] = b

                coef_map[name + '_' + method_name + '_' + ','.join(segment_vars) + '_' + str(i)] = fit.params
                lower_bounds = []
                upper_bounds = []
                for ci in fit.conf_int():
                    lower_bounds += [ci[0]]
                    upper_bounds += [ci[1]]
                coef_map[name + '_' + method_name + '_' + str(i) + '_lower_bound'] = lower_bounds
                coef_map[name + '_' + method_name + '_' + str(i) + '_upper_bound'] = upper_bounds
            
            # plot sorted correlation
            sorted_series = series.sort_values(['mean_' + output])
            for seg in avg_variables:
                plt.clf()
                plt.plot(sorted_series['mean_' + output].as_matrix(), sorted_series['mean_'+seg].as_matrix(), marker='o')
                plt.xlabel('Average ' + output.replace('___output___' + country + '_' + str(year), '') + ' output')
                plt.ylabel('Average ' + seg.replace('___policy___'  + country + '_' +str(year), '').replace('_',' '))
                plt.savefig(os.path.join(out_dir, 'plot_' + seg + '_' + output + '.pdf'))
                
            # add mse's to table
            keys = sorted(y_test.keys())
            name = 'OLS'
            sort_t = []
            sort_p = []

            for key in keys:
                if key not in y_test or key not in reg_clus[name]:
                    continue
                sort_t.append(reg_clus[name][key])
                sort_p.append(y_test[key])

            try:
                test_c = digitize(y, sort_t)
                pred_c = digitize(y, sort_p)
                auc_c = roc_auc_score(test_c, pred_c, average='macro')
            except ValueError:
                auc_c = 0.5
            mse = mean_squared_error(sort_t,sort_p)
            scaled_mse = (mse/np.std(y))
            new_row = pd.DataFrame({'model': name, 'segment': ','.join(segment_vars), 'input': baseline, 'scaled_mse': scaled_mse, 'mse': mse, 'roc_auc': auc_c, 'clustered': True, 'method': method_name}, index=[0])
            table = table.append(new_row, ignore_index=True)
                
    
        ##### Run grouped regressions
        if (len(segment_vars) > 1):
            _run(add_clusters(), 'clustered')
        #_run(add_location_segments(), 'segmented')
        #_run(add_segments(), 'segmented')
    
    def run_with_inputs(input_vars, name):
        # map to be used in tracking coefficients
        global coef_map
        global coef_table
        global x_train
        global x_test
        coef_map = {}
        x_scaled, x_train, x_test = get_train_test(input_vars)
        # Update Lasso Lambda using GLMNET.
        update_best_lambda(x_scaled)
        calc_unsegmented(name, x_train, x_test)
        calc_segmented(segment_variables, name, x_train, x_test)
        
        for k,v in sorted(coef_map.items()):
            kvp = dict()
            kvp['model'] = k
            kvp['inputs'] = name

            for val,invar in zip(v,input_vars):
                kvp[invar] = val

            new_row = pd.DataFrame(kvp, index=[0])
            coef_table = coef_table.append(new_row, ignore_index=True)
    
    # Baseline 1
    # input_vars = inputs + non_policy_inputs
    # run_with_inputs(input_vars, 'All variables')

    global all_relevant, policy_relevant
    
    print ('Relevant variables')
    print(all_relevant, policy_relevant)
    run_with_inputs(all_relevant, 'Highly correlated all')
    run_with_inputs(policy_relevant, 'Highly correlated policy')
    
    # save coefficient and output tablesdrop
    coef_table.to_csv(os.path.join(out_dir,'coef_' + output + '.csv'))
    table.to_csv(os.path.join(out_dir,output + '.csv'))
    avg_table.to_csv(os.path.join(out_dir,output + '_avg' + '.csv'))
    

# Functions to just populate clusters across years.

def get_segments(df, output):
    df_t = df.loc[df[output].dropna().index]
    df['segment_' + output], _ = get_classes(df_t[output], df[output])
    return df

def complete(df):
    imp = Imputer(strategy="mean")
    completed = imp.fit_transform(df)
    cols = df.columns.values
    mat = pd.DataFrame(completed,columns=cols)
    mat_scaled = StandardScaler()
    mat_scaled.fit(mat)
    mat_sc = mat_scaled.transform(mat)
    mat = pd.DataFrame(mat_sc, columns=mat.columns)
    return mat

# Although this repeats calculation done above with clustering/regression, it allows us to add columns per year
# in the same dataframe.
def get_clusters(mat, output, segment_vars):
    segment_vars = list(segment_vars.keys())
    seg_data = mat[segment_vars]
    min_k = 4
    kmeans = KMeans(n_clusters=min_k).fit(seg_data)
    labels = kmeans.labels_
    means = []
    for i in np.unique(labels):
        df_clus = mat.loc[labels == i]
        means.append(np.mean(df_clus[output].as_matrix()))
    sorted_ids = [i[0] for i in sorted(enumerate(means), key=lambda x:x[1])]
    mat['segment_'+ output] = labels
    mat['segment_'+ output] = mat['segment_'+ output].apply(lambda x: sorted_ids[x])
    return mat

In [11]:
import re

df = pd.read_csv(filename)
country = 'uganda'

# change to true if you want to use all input fields
def get_vars(year):
    year_vars = df.columns.values
    out_reg = re.compile('.*' + base_output + '___output.*$')
    outputs = list(filter(out_reg.search, year_vars))
    policy_reg = re.compile('(.*___policy.*)$')
    policy_inputs = list(filter(policy_reg.search, year_vars))
    non_policy_reg = re.compile('^((?!policy|output|weight).)*$')
    non_policy_inputs = list(filter(non_policy_reg.search, year_vars))
    
    return outputs, policy_inputs, non_policy_inputs

# When true, df will contain columns for clusters across years, which then can be used to calculate
# evidence of change across years and agreement numbers.
generate_multiple_years_clusters = False
if generate_multiple_years_clusters:
    years = [2009, 2011, 2013]
    raise "Not supported"
else:
    years = [2013]

base_output = 'crop_sales'
# Choose variables to segment on based on correlation file.
ccs = pd.read_csv(country + '_corr.csv')
output = base_output + '___output'
ccs[output] = ccs[output].apply(lambda x: abs(x))

# We chose clustering features based on the iterative approach.
# Note(Sam): Modifying this regex will change variables to cluster on.
select = ccs['Unnamed: 0'].str.contains('^.*(land_surface|number_of_days_hired|crop_diver|tools|mono)')
ccs = ccs[select]
ccs = ccs.sort_values(output, ascending=False)
num_vars=8
best_vars = ccs[['Unnamed: 0', output]][:num_vars].as_matrix()
segment_variables = {}
seg_vars = []
for i in best_vars:
    name = i[0]
    segment_variables[name] = i[1]

print(segment_variables)

raw_df = df.copy()
df = complete(df)
for year in years:
    outputs, policy_inputs, non_policy_inputs = get_vars(year)
    for output in outputs:
        if generate_multiple_years_clusters:
            df = get_clusters(df, output, segment_variables)
        run_regressions(fixed_k=4, in_name=filename, out_dir='../results/uganda_v2_' + base_output, non_policy_inputs=non_policy_inputs, segment_variables=segment_variables, inputs=policy_inputs, output=output, year=year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


{'land_surface': 0.4354288395677967, 'number_of_tools_owned___policy': 0.3444603945768074, 'crop_diversification___policy': 0.33465334152305737, 'number_of_days_hired_workers___policy': 0.14621648709178992, 'household_head_is_monogamous': 0.043576416427216144}
Dir exists
Relevant variables
['quantity_of_improved_seeds___policy', 'owns_land_certificate___policy', 'number_of_days_hired_workers___policy', 'number_of_ploughs_owned___policy', 'household_head_is_widowed', 'number_of_cows_owned___policy', 'has_borrowed___policy', 'quantity_of_fertilizers_used___policy', 'quantity_of_pesticides_used___policy'] ['quantity_of_improved_seeds___policy', 'owns_land_certificate___policy', 'number_of_days_hired_workers___policy', 'number_of_ploughs_owned___policy', 'number_of_cows_owned___policy', 'has_borrowed___policy', 'quantity_of_fertilizers_used___policy', 'quantity_of_pesticides_used___policy']
[ 0.19456129  0.74992855  1.41741665 ...,  0.61468031  0.13070197
  1.42912404]
[ 0.00892927]
[0.590

In [None]:
### Below cells are still not ready. Will tweak as we get more data for Tanzania.
assert generate_multiple_years_clusters==True, "Below cells are not supported"

# Run this once to avoid overwriting df in the code below.
df_all = df

In [602]:
# Compute evidence of movement across years, the lift due to movement in relevant inputs.
df = df_all
series = pd.DataFrame()
for imp_feat in imp_feats:
    for output in ['segment_crop_sales___output']:
        years = ['2011', '2013', '2015']
        raw_output = 'crop_sales___output'
        coef= {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}
        for [y1, y2] in [['2011', '2013'], ['2013', '2015']]:
            #try:
            z1 = for_year(output, y1)
            z2 = for_year(output, y2)
            r1 = for_year(raw_output, y1)
            r2 = for_year(raw_output, y2)
            f1 = for_year(imp_feat,y1)
            f2 = for_year(imp_feat,y2)
            df[output + '_change' + y1] = (df[z1]!=df[z2])
            df[output + '_increase' + y1] = (df[z1]<df[z2])
            df[output + '_decrease' + y1] = (df[z1]>df[z2])
            expected = df[z1].apply(lambda x: coef[x])
            df[imp_feat+'_increase'+y1] = (imp_df[f1]<imp_df[f2])
            df[output + '_inversion' + y1] = (df[z2]-df[z1])*(df[f2]-df[f1])*expected
            df[output + '_change_value' + y1] = (imp_df[r2]-imp_df[r1])

        for per in [50,55,60,70,75,80,85,90,95]:
            for seg in range(4):
                exp_y_i = []
                exp_y = []
                exp = []
                exp_i = []
                for [y1, y2] in [['2011', '2013'], ['2013', '2015']]:
                    year = y1
                    weight = for_year('weight', year)
                    seg_y = for_year(output, year)
                    df_seg = df[df[seg_y]==seg]
                    df_seg = df_seg.loc[df_seg[output + '_change_value' + year].dropna().index]
                    thres = np.percentile(df_seg[output + '_change_value' + year].dropna(), per)
                    high_df = df_seg[df_seg[output + '_change_value' + year].apply(lambda x : x >= thres)]
                    exp_y_i.append(len(high_df[(high_df[imp_feat + '_increase'+year]==True)]))
                    exp_y.append(len(high_df))
                    exp.append(len(df_seg))
                    exp_i.append(len(df_seg[df_seg[imp_feat + '_increase'+year]==True]))
                    avg_y_i = np.mean(high_df[(high_df[imp_feat + '_increase'+year]==True)][output + '_change_value' + year])
                    avg_y = np.mean(high_df[output + '_change_value' + year].dropna())
                    avg = np.median(df[output + '_change_value' + year].dropna())
                row = {}
                row['threshold'] = per
                row['input'] = imp_feat
                row['cluster'] = seg
                row['movement overall'] = (sum(exp_y)/sum(exp))*100.0
                row['movement conditioned'] = (sum(exp_y_i)/sum(exp_i))*100.0
                row['movement lift'] = (sum(exp_y_i)/sum(exp_i))/(sum(exp_y)/sum(exp))
                new_row = pd.DataFrame(row, index=[0])
                series = series.append(new_row, ignore_index=True)

series.to_csv('../results/threshold-lift.csv')

owns_land_certificate___policy 2011 2013 0 100 6.666666666666667 11.0 1.6500000000000001 63.2173423708 98.0094046816
owns_land_certificate___policy 2011 2013 1 129 18.441558441558442 20.930232558139537 1.1349492302653128 187.719676667 98.0094046816
owns_land_certificate___policy 2011 2013 2 94 16.693944353518823 18.085106382978726 1.0833333333333335 25.8017629654 98.0094046816
owns_land_certificate___policy 2011 2013 3 38 24.369747899159663 21.052631578947366 0.8638838475499092 -8.95413798691 98.0094046816
owns_land_certificate___policy 2013 2015 0 160 12.430632630410656 10.0 0.8044642857142857 -27.3660277734 -22.2788003332
owns_land_certificate___policy 2013 2015 1 149 16.028708133971293 22.14765100671141 1.3817489732545327 3.01906953167 -22.2788003332
owns_land_certificate___policy 2013 2015 2 185 5.785123966942149 10.81081081081081 1.8687258687258688 -16.2521572793 -22.2788003332
owns_land_certificate___policy 2013 2015 3 57 22.188449848024316 21.052631578947366 0.9488103821196827 -

illness_of_household_member___policy 2013 2015 0 175 12.430632630410656 13.714285714285715 1.1032653061224489 -27.3660277734 -22.2788003332
illness_of_household_member___policy 2013 2015 1 134 16.028708133971293 16.417910447761194 1.0242815771886835 3.01906953167 -22.2788003332
illness_of_household_member___policy 2013 2015 2 338 5.785123966942149 5.029585798816568 0.8693998309382924 -16.2521572793 -22.2788003332
illness_of_household_member___policy 2013 2015 3 63 22.188449848024316 22.22222222222222 1.0015220700152205 -57.6725140427 -22.2788003332
has_health_issues___policy 2011 2013 0 509 6.666666666666667 5.893909626719057 0.8840864440078586 63.2173423708 98.0094046816
has_health_issues___policy 2011 2013 1 426 18.441558441558442 17.136150234741784 0.9292137803345897 187.719676667 98.0094046816
has_health_issues___policy 2011 2013 2 218 16.693944353518823 18.34862385321101 1.0991185465011692 25.8017629654 98.0094046816
has_health_issues___policy 2011 2013 3 85 24.369747899159663 24.

has_borrowed___policy 2013 2015 0 120 12.430632630410656 7.5 0.6033482142857143 -27.3660277734 -22.2788003332
has_borrowed___policy 2013 2015 1 99 16.028708133971293 11.11111111111111 0.693200663349917 3.01906953167 -22.2788003332
has_borrowed___policy 2013 2015 2 208 5.785123966942149 5.288461538461538 0.9141483516483516 -16.2521572793 -22.2788003332
has_borrowed___policy 2013 2015 3 35 22.188449848024316 28.57142857142857 1.2876712328767121 -57.6725140427 -22.2788003332
uses_credit___policy 2011 2013 0 66 6.666666666666667 15.151515151515152 2.272727272727273 63.2173423708 98.0094046816
uses_credit___policy 2011 2013 1 94 18.441558441558442 25.53191489361702 1.3844770752172608 187.719676667 98.0094046816
uses_credit___policy 2011 2013 2 89 16.693944353518823 14.606741573033707 0.8749724608944701 25.8017629654 98.0094046816
uses_credit___policy 2011 2013 3 21 24.369747899159663 42.857142857142854 1.7586206896551724 -8.95413798691 98.0094046816
uses_credit___policy 2013 2015 0 92 12.43