# Data 4 | kNN

This defines a neighborhood of the k nearest neighbors for every voter in the state and assigns to each voter the composition of that neighborhood. Since computing this directly for every voter is computationally prohibitive, the main optimization uses the square neighborhoods to partition the matrix. This is done by computing the population of the square partitions whose centroids lie within some increasing radius. The kNN algorithm then runs over only these selected partitions.


## Run The Code

In [1]:
from data4_knn import *

# Start the overall timer and register the state-level header in the shared log.
T0 = time.time()
data_name = f'{state} | {meters}m'
print_log[data_name] = {'log':[]}
printer(print_log)

# Only dates whose year is 2010 or later are processed, in ascending order.
use_dates = sorted([date for date in dates if int(date[:4]) >= 2010])
for date in use_dates:
    data_name = f'{date} @ {k_max}k'
    save_name = f'{date}_{k_max}k.pkl'
    print_log[data_name] = {'log':[],'sublog':[]}
    printer(print_log)
    
    """ Check If All Chunks Exist """
    
    # Take everything after the date in each input chunk name (e.g. '_chunk_8.pkl').
    # str.partition is used instead of str.strip because strip treats its argument as a
    # set of characters to remove from both ends, not as a prefix.
    chunk_nums = [x.partition(date)[2] for x in os.listdir(path_1) if date+'_' in x]
    future_files = [f'knn_{date}_{k_max}k{chunk}' for chunk in chunk_nums]
    # List the output directory once instead of once per expected file.
    existing_files = set(os.listdir(path_4 + 'chunks'))
    all_files_finished = all(file in existing_files for file in future_files)

    if all_files_finished:
        print_log[data_name]['log'].append('  Done')
        printer(print_log)
        
    else:
        """ Step 1 | Proximity Matrix """
        
        t0 = time.time()
        print_log[data_name]['log'].append('  Step 1 | Proximity Matrix')
        printer(print_log)
        
        # NOTE(review): pickle.load executes arbitrary code from the file; only load
        # SNd files produced by this pipeline.
        with open(path_3 + f'SNd_{date}_{meters}m.pkl','rb') as f:
            SNd = pickle.load(f)
        
        Pd, POPd = proximity_matrix(SNd, date, print_log)
        runtime = round(( time.time() - t0 ) / 60 )
        # Overwrite the "in progress" entry with the finished one (shown on the next printer call).
        print_log[data_name]['log'][-1] = f'  Step 1 | Proximity Matrix (Runtime: {runtime} mins)'

        """ Step 2 | Run KNN by Neighborhood """
        
        t0 = time.time()
        print_log[data_name]['log'].append('  Step 2 | Run KNN by Neighborhood')
        printer(print_log)

        def ni_knn(ni, run_NVId=False):
            """ Find the statistics of the nearest k neighbors for every voter in neighborhood ni.

            ni       : key of the neighborhood in SNd (also used to look up Pd[ni]['P']).
            run_NVId : when True, also return the neighbor indices found for each voter and k.

            Returns knn_stats (one row per voter of the neighborhood), or the tuple
            (knn_stats, nearest_dict) when run_NVId is True.
            """

            n, P = SNd[ni], Pd[ni]['P']
            # Candidate neighbors: voters of every populated partition in P, plus the voters of n itself.
            ni_voters = [SNd[p] for p in P if POPd[p]>0] + [n]
            ni_voters = pd.concat(ni_voters, ignore_index=True)

            n_lat_lon = list(zip(n.lat, n.lon))
            ni_v_lat_lon = list(zip(ni_voters.lat, ni_voters.lon))
            ni_distances = pd.DataFrame(haversine_vector(ni_v_lat_lon, n_lat_lon, Unit.METERS, comb=True))
            # Rows are labelled by the voters of n, columns by the candidate voters.
            ni_distances.columns = ni_voters.idu.values
            ni_distances.index = n.idu.values

            def vi_knn(n, ni_voters, ni_distances):
                """ Run knn for all voters in n. """
                
                # Other extras handled below: 'M', 'F', 'age', 'lat', 'lon', 'dist'.
                extra_variables = ['white','black',]
                # The k-th neighbor's row is only looked up when 'lat'/'lon' are requested,
                # since nothing else uses it.
                need_kth_neighbor = 'lat' in extra_variables or 'lon' in extra_variables
                knn_stats = pd.DataFrame(columns=knn_col_names(extras=extra_variables))
                nearest_dict = {}
                for _, v in n.iterrows():
                    v_sorted = ni_distances.loc[v.idu]
                    if isinstance(v_sorted, pd.DataFrame):
                        # Rare case: several rows share this idu, so .loc returns a
                        # DataFrame; use the first matching row.
                        v_sorted = v_sorted.iloc[0]
                    v_sorted = v_sorted.sort_values()
                    # Statistics of the voters at distance exactly 0 come first.
                    v_0_index = v_sorted[v_sorted == 0].index
                    v_0 = ni_voters[ni_voters.idu.isin(v_0_index)]
                    v_stats = [sum(v_0[p]) for p in party_list]

                    nearest_dict[v.idu] = {}
                    for k in k_list:
                        v_knn_index = v_sorted[:k].index
                        nearest_dict[v.idu][k] = v_knn_index
                        v_knn = ni_voters[ni_voters.idu.isin(v_knn_index)]
                        v_stats += [sum(v_knn[p]) for p in party_list]
                        if need_kth_neighbor:
                            v_kn_index = v_sorted[k-1:k].index[0]
                            v_kn = ni_voters[ni_voters.idu == v_kn_index]
                        
                        """ Additional Variables """
                        if 'white' in extra_variables:
                            v_stats += [sum(v_knn['race']=='WHITE')]
                        if 'black' in extra_variables:
                            v_stats += [sum(v_knn['race']=='BLACK or AFRICAN AMERICAN')]
                        if 'M' in extra_variables:
                            v_stats += [sum(v_knn.gender == 'M')]
                        if 'F' in extra_variables:
                            v_stats += [sum(v_knn.gender == 'F')]
                        if 'age' in extra_variables:
                            v_stats += [v_knn.age.astype(int).mean()]
                        # NOTE(review): v_kn.lat / v_kn.lon are Series, not scalars — confirm
                        # this is what knn_col_names expects before enabling 'lat'/'lon'.
                        if 'lat' in extra_variables:
                            v_stats += [v_kn.lat]
                        if 'lon' in extra_variables:
                            v_stats += [v_kn.lon]
                        if 'dist' in extra_variables:
                            # Positional lookup: v_sorted is labelled by idu, so plain
                            # [k-1] would be resolved against the labels.
                            v_stats += [v_sorted.iloc[k-1]]
                        
                        # TODO: include the turnout of the KNN too ... although this might be more costly and involve a choice of which elections
                    knn_stats.loc[v.idu] = [v.idu] + v_stats
                if run_NVId:
                    return knn_stats, nearest_dict
                else:
                    return knn_stats
            return vi_knn(n, ni_voters, ni_distances)

        """ NVId saves the closest neighbors indices. """
        
        # Neighbor indices are only kept for the first 1000 neighborhoods of SNd.
        NVId, NVId_subsample = {}, list(SNd)[:1000] # {}, []
        # Set copy so the per-neighborhood membership test below is a hash lookup.
        NVId_subsample_keys = set(NVId_subsample)
        
        """ Run KNN. """
        
        KNNd, finished = {}, []
        for ni in SNd:
            if ni in NVId_subsample_keys:
                KNNd[ni], NVId[ni] = ni_knn(ni, run_NVId=True)
            else:
                KNNd[ni] = ni_knn(ni)
            finished.append(ni)

            # Progress line: share of neighborhoods done and minutes elapsed in Step 2.
            runtime = f'{round( len(finished)*100/len(SNd), 2)}% in {round(( time.time() - t0 ) / 60 )} mins'
            print_log[data_name]['sublog'] = [f'   || KNN {runtime}']
            printer(print_log)
        # Drop the large inputs as soon as they are no longer needed.
        del SNd
        
        with open(f'{path_4}NVId_{date}_{k_max}_knn.pkl','wb') as f: 
            pickle.dump(NVId, f)
        del NVId

        print_log[data_name]['sublog'] = []
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[data_name]['log'][-1] = f'  Step 2 | Run KNN by Neighborhood (Runtime: {runtime} mins)'
        printer(print_log)

        """ Step 3 | Saving """
        t0 = time.time()
        print_log[data_name]['log'].append('  Step 3 | Saving')
        printer(print_log)
        
        voters = pd.concat(KNNd)
        
        # Merge the knn statistics into each input chunk and save one output file per chunk.
        chunk_files = [chunk for chunk in os.listdir(path_1) if date+'_' in chunk]
        for chunk_file in chunk_files:
            print('    ', chunk_file)
            chunk = pd.read_pickle(path_1 + chunk_file)
            knn_chunk = voters.merge(chunk)
            
            # Everything after the date in the file name (e.g. '_chunk_8.pkl').
            # str.strip is avoided because it removes a set of characters, not a prefix.
            chunk_number = chunk_file.partition(date)[2]
            knn_chunk.to_pickle(path_4 + f'chunks/knn_{date}_{k_max}k{chunk_number}')
        
        """ Saving print_log """
        now = datetime.now()
        savedate = now.strftime('%Y%m%d')
        # The context manager closes the log file even if the write fails.
        with open(f'{path_4}print_log_{savedate}.txt', 'w') as f:
            f.write(string_printer(print_log))

NC | 2000m
20101102 @ 10050k
  Done
20110101 @ 10050k
  Done
20121106 @ 10050k
  Done
20130101 @ 10050k
  Done
20141231 @ 10050k
  Done
20151103 @ 10050k
  Done
20161108 @ 10050k
  Done
20171107 @ 10050k
  Done
20181106 @ 10050k
  Done
20191105 @ 10050k
  Done
20201103 @ 10050k
  Step 1 | Proximity Matrix (Runtime: 0 mins)
  Step 2 | Run KNN by Neighborhood (Runtime: 1936 mins)
  Step 3 | Saving
     20201103_chunk_8.pkl
     20201103_chunk_9.pkl
     20201103_chunk_14.pkl
     20201103_chunk_2.pkl
     20201103_chunk_6.pkl
     20201103_chunk_0.pkl
     20201103_chunk_7.pkl
     20201103_chunk_12.pkl
     20201103_chunk_13.pkl
     20201103_chunk_10.pkl
     20201103_chunk_15.pkl
     20201103_chunk_3.pkl
     20201103_chunk_11.pkl
     20201103_chunk_4.pkl
     20201103_chunk_5.pkl
     20201103_chunk_1.pkl


#### I ran this afterwards, since I didn't have the 2020 variables during the initial run.

In [18]:
from data4_knn import *

# Re-merge the input chunks of one date into the already-saved knn chunks, in place.
date = '20201103'

chunk_files = [chunk for chunk in os.listdir(path_1) if date+'_' in chunk]
for chunk_file in chunk_files:
    print('    ', chunk_file)
    
    chunk = pd.read_pickle(path_1 + chunk_file)
    
    # Everything after the date in the file name (e.g. '_chunk_8.pkl').
    # str.strip is avoided because it removes a set of characters, not a prefix.
    chunk_number = chunk_file.partition(date)[2]
    # The same file is read and then overwritten, so build its path once.
    knn_chunk_path = path_4 + f'chunks/knn_{date}_{k_max}k{chunk_number}'
    knn_chunk = pd.read_pickle(knn_chunk_path)

    knn_chunk = knn_chunk.merge(chunk)
    
    knn_chunk.to_pickle(knn_chunk_path)


     20201103_chunk_8.pkl
     20201103_chunk_9.pkl
     20201103_chunk_14.pkl
     20201103_chunk_2.pkl
     20201103_chunk_6.pkl
     20201103_chunk_0.pkl
     20201103_chunk_7.pkl
     20201103_chunk_12.pkl
     20201103_chunk_13.pkl
     20201103_chunk_10.pkl
     20201103_chunk_15.pkl
     20201103_chunk_3.pkl
     20201103_chunk_11.pkl
     20201103_chunk_4.pkl
     20201103_chunk_5.pkl
     20201103_chunk_1.pkl


## Analysis

Visualize a voter's knn map and summarize the data.