# joining the datasets house_to_join.csv and brookings_to_join.csv
- ### In this code, I merged brookings_to_join.csv and house_to_join.csv. The best-case scenario was 633 rows, and the result was 621 rows. (Note: the run recorded below, with threshold=70, reports a shape of 578 rows.)
- Result was saved as brookings_538_merge.csv

pip install fuzzywuzzy python-Levenshtein

In [48]:
import numpy as np
import pandas as pd
%matplotlib inline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path  
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from fuzzywuzzy import fuzz


## we can merge on 
- brookings['Primary.Outcome'] == house['General Status']
- brookings['District'] == house['District Abbrev']
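- brookings['Candidate'] fuzzy-matched against house['Candidate'] using `fuzz.ratio`; the similarity score must exceed a threshold (the function below defaults to 50, and the run below uses 70). A moderate threshold should be enough so long as the other two conditions are also matched.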

### Best case is 633 resulting rows, i.e. every row of house (463 + 170 = 633, see the value counts below) finds a match.

In [49]:
# Load the two input tables that are joined further down.
brookings_csv = 'brookings_to_join.csv'
house_csv = 'house_to_join.csv'
brookings = pd.read_csv(brookings_csv)
house = pd.read_csv(house_csv)

In [50]:
# Value counts (NaN included) of the two outcome columns used as a join key,
# displayed side by side as a tuple.
primary_outcome_counts = brookings['Primary.Outcome'].value_counts(dropna=False)
general_status_counts = house['General Status'].value_counts(dropna=False)
primary_outcome_counts, general_status_counts

(0    627
 1    183
 Name: Primary.Outcome, dtype: int64,
 0    463
 1    170
 Name: General Status, dtype: int64)

In [51]:
def fuzzy_join(brookings, house, threshold=50):
    """Join ``brookings`` rows to ``house`` rows that describe the same candidate.

    A pair of rows is joined when all three conditions hold:

    * ``brookings['District'] == house['District Abbrev']``
    * ``brookings['Primary.Outcome'] == house['General Status']``
    * ``fuzz.ratio`` of the two lower-cased ``Candidate`` strings is strictly
      greater than ``threshold``.

    Parameters
    ----------
    brookings : pandas.DataFrame
        Must contain the columns ``District``, ``Primary.Outcome`` and
        ``Candidate``.
    house : pandas.DataFrame
        Must contain the columns ``District Abbrev``, ``General Status`` and
        ``Candidate``.
    threshold : int or float, default 50
        Similarity score (as returned by ``fuzz.ratio``) that a name pair has
        to exceed to count as a match.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair, ordered by ``brookings`` row and then by
        ``house`` row, with the ``brookings`` columns followed by the
        ``house`` columns and a fresh ``RangeIndex``. A source row appears
        once for every partner it matches. When nothing matches, an empty
        frame with the combined columns is returned.

    Notes
    -----
    Rows whose ``Candidate`` is not a string (e.g. NaN from ``read_csv``) or
    whose key columns are missing can never match and are skipped rather
    than raising.
    """
    combined_columns = list(brookings.columns) + list(house.columns)

    # Bucket the house rows by their exact-match key so each brookings row is
    # only compared against candidates from the same district and outcome.
    # Positions (not index labels) are stored because the rows are fetched
    # with .iloc at the end.
    house_by_key = {}
    house_fields = zip(house['District Abbrev'], house['General Status'],
                       house['Candidate'])
    for house_pos, (district, status, candidate) in enumerate(house_fields):
        if pd.isna(district) or pd.isna(status) or not isinstance(candidate, str):
            continue
        house_by_key.setdefault((district, status), []).append(
            (house_pos, candidate.lower()))

    brookings_positions = []
    house_positions = []
    brookings_fields = zip(brookings['District'], brookings['Primary.Outcome'],
                           brookings['Candidate'])
    for brookings_pos, (district, outcome, candidate) in enumerate(brookings_fields):
        if pd.isna(district) or pd.isna(outcome) or not isinstance(candidate, str):
            continue
        name = candidate.lower()
        for house_pos, house_name in house_by_key.get((district, outcome), ()):
            if fuzz.ratio(name, house_name) > threshold:
                brookings_positions.append(brookings_pos)
                house_positions.append(house_pos)

    if not brookings_positions:
        return pd.DataFrame(columns=combined_columns)

    # Build the result with one concat instead of appending row by row:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    left = brookings.iloc[brookings_positions].reset_index(drop=True)
    right = house.iloc[house_positions].reset_index(drop=True)
    return pd.concat([left, right], axis=1)

In [52]:
# Run the join with threshold=70, overriding fuzzy_join's default of 50, so a
# name pair must score above 70 to be kept.
new_df = fuzzy_join(brookings, house, threshold=70)

In [53]:
# Size of the merged table as (rows, columns).
new_df.shape

(578, 86)

In [55]:
# Save the merged table. index=False is not passed, so the DataFrame index is
# written as the first (unnamed) column of the CSV.
new_df.to_csv('brookings_538_merge.csv')

## Step 1: Go through all columns and take out ones that we are not interested in learning about
## Step 2: One-hot encode (OHE) the remaining dataset
## Step 3: Drop one column from each OHE variable
## Step 4: Use forward selection to model select

In [56]:
# Columns to one-hot encode, assembled from named groups. The final order is
# significant because it determines the order of the encoded columns.
_profile_cols = (
    'Candidate.Gender',
    'Listed.military.service.',
    'Education',
    'Marital.Status',
    'Previous.Electoral.Experience',
)

_position_cols = (
    'Position.on.Affordable.Care.Act..ObamaCare.',
    'Position.on.Minimum.Wage',
    'Position.on.Federal.Taxes',
    'Position.on.Business.Regulations',
    'Position.on.National.Debt.Deficit',
    'Position.on.Social.Security',
    'Position.on.Gun.Control',
    'Position.on.Immigration',
    'Position.on.Abortion',
    'Position.on.Same.Sex.Marriage',
    'Position.on.Criminal.Justice.Reform',
    'Position.on.Federal.K.12.Education.Policy',
    'Position.on.Climate.Change',
    'Position.on.Campaign.Finance.Reform',
    'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
    'Position.on.Defense.Spending',
    'Position.on.Handling.Terrorism.Abroad',
    'Position.on.Russia',
)

_mention_cols = (
    'Trump.Mention',
    'Obama.Mention',
    'Sanders.Mention',
    'Clinton.Mention',
    'Special.Counsel.Mention',
    'Travel.Ban.Mention',
)

# NOTE: 'Warren Endorsed? ' really does end with a space; keep it as is so it
# matches the column name in the data.
_support_cols = (
    'Party Support?',
    'Emily Endorsed?',
    'Guns Sense Candidate?',
    'Biden Endorsed?',
    'Warren Endorsed? ',
    'Sanders Endorsed?',
    'Our Revolution Endorsed?',
    'Justice Dems Endorsed?',
    'PCCC Endorsed?',
    'Indivisible Endorsed?',
    'WFP Endorsed?',
    'VoteVets Endorsed?',
    'No Labels Support?',
)

# Must stay a list: it is used as a column selector (new_df[to_ohe]).
to_ohe = [
    *_profile_cols,
    *_position_cols,
    'Party.Category',
    *_mention_cols,
    'SinglePayer',
    *_support_cols,
]

In [57]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column listed in to_ohe. drop='first' removes the first
# category of each variable so the resulting dummies are not perfectly
# collinear.
encoder = OneHotEncoder(drop='first')
encoded_data = encoder.fit_transform(new_df[to_ohe]).toarray()

# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out; fall back only on versions that lack it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(to_ohe)
else:
    encoded_columns = encoder.get_feature_names(to_ohe)

# Reuse new_df's index so the column-wise concat below lines rows up by
# construction instead of relying on both frames having a default RangeIndex.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=new_df.index)

# Swap the original categorical columns for their encoded counterparts.
final_df = pd.concat([new_df, encoded_df], axis=1)
final_df.drop(to_ohe, axis=1, inplace=True)

In [58]:
# Save the one-hot-encoded table. index=False is not passed, so the DataFrame
# index is written as the first (unnamed) column of the CSV.
final_df.to_csv('brookings_538_ohe.csv')

In [19]:
# Load the table used for the correlation analysis below.
# NOTE(review): no cell visible here writes 'ready for model building.csv';
# presumably it was prepared from brookings_538_ohe.csv outside this
# notebook - confirm.
plot_this = pd.read_csv('ready for model building.csv')

In [47]:
# Pair plot to visualize correlations (left disabled in this run):
# sns.pairplot(plot_this)
# plt.show()

# Pairwise correlation matrix of the columns of plot_this (DataFrame.corr
# uses Pearson correlation by default). The reload below is disabled because
# plot_this is already loaded by an earlier cell.
# plot_this = pd.read_csv('ready for model building.csv')
correlation_matrix = plot_this.corr()
correlation_matrix

Unnamed: 0,General Status,Female,total_runners,Partisan Lean,Primary %,Race,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,...,PCCC Endorsed?_Yes,PCCC Endorsed?_nan,Indivisible Endorsed?_Yes,Indivisible Endorsed?_nan,WFP Endorsed?_Yes,WFP Endorsed?_nan,VoteVets Endorsed?_Yes,VoteVets Endorsed?_nan,No Labels Support?_Yes,No Labels Support?_nan
General Status,1.000000,0.262351,-0.212798,-0.122069,0.803230,-0.036827,-0.006360,-0.029316,0.090913,0.014290,...,0.168076,-0.003441,0.191482,-0.006690,0.096968,0.023109,0.119615,0.028374,-0.034099,0.007032
Female,0.262351,1.000000,0.018496,0.084285,0.256588,0.094434,-0.124771,0.056995,0.081073,-0.043787,...,0.073702,-0.056054,0.099271,-0.058246,0.058037,-0.008756,-0.001777,0.003667,-0.042046,0.009521
total_runners,-0.212798,0.018496,1.000000,0.503139,-0.500292,0.076013,-0.082409,0.028340,0.153354,0.159226,...,-0.006175,-0.031516,-0.012849,-0.096065,-0.003437,-0.095283,-0.040546,-0.087386,0.052762,-0.135819
Partisan Lean,-0.122069,0.084285,0.503139,1.000000,-0.268482,0.312052,-0.060830,-0.005460,0.283610,0.139214,...,0.116806,-0.304732,-0.058844,0.035420,0.037799,-0.108655,0.040721,-0.112340,0.059043,-0.182343
Primary %,0.803230,0.256588,-0.500292,-0.268482,1.000000,-0.066734,0.041796,-0.026486,0.102001,-0.005847,...,0.124390,0.004632,0.203213,0.007569,0.097705,0.031914,0.182873,0.051203,0.003796,0.063511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WFP Endorsed?_nan,0.023109,-0.008756,-0.095283,-0.108655,0.031914,-0.049349,-0.002914,0.022337,-0.099385,0.047075,...,-0.123902,0.082977,0.036120,-0.055075,-0.482420,1.000000,-0.094175,0.171342,-0.073933,0.209040
VoteVets Endorsed?_Yes,0.119615,-0.001777,-0.040546,0.040721,0.182873,0.011612,0.484662,-0.003330,0.042896,-0.001623,...,-0.027617,-0.047733,0.025457,-0.054850,0.123465,-0.094175,1.000000,-0.484662,0.135422,-0.026917
VoteVets Endorsed?_nan,0.028374,0.003667,-0.087386,-0.112340,0.051203,-0.052452,-0.184062,0.024137,0.005359,-0.143255,...,-0.016208,0.017640,-0.042092,0.073527,-0.102049,0.171342,-0.484662,1.000000,-0.055953,0.127945
No Labels Support?_Yes,-0.034099,-0.042046,0.052762,0.059043,0.003796,0.038474,0.055953,-0.012789,0.076542,-0.012529,...,-0.007819,0.015850,-0.014265,0.028407,-0.009942,-0.073933,0.135422,-0.055953,1.000000,-0.388469


In [46]:
# Take the first row of the correlation matrix (labelled 'General Status' in
# the output above) and turn it into a one-column frame named after that row.
first_row = correlation_matrix.iloc[0]
correlation_df = first_row.to_frame()
correlation_df

Unnamed: 0,General Status
General Status,1.000000
Female,0.262351
total_runners,-0.212798
Partisan Lean,-0.122069
Primary %,0.803230
...,...
WFP Endorsed?_nan,0.023109
VoteVets Endorsed?_Yes,0.119615
VoteVets Endorsed?_nan,0.028374
No Labels Support?_Yes,-0.034099


In [37]:
# Rank by strength of association regardless of sign: take absolute values
# (the sign is discarded from here on), then sort ascending, which is the
# sort_values default.
correlation_df = correlation_df.abs().sort_values("General Status")
# Usage reminder: DataFrame.sort_values("column to sort by", ascending=True)

In [64]:
# Re-sort in descending order so the largest values come first.
correlation_df = correlation_df.sort_values("General Status",ascending=False)

In [65]:
# Show the first 20 rows of the sorted frame; the first row is 'General Status'
# against itself (1.0).
correlation_df.head(20)

Unnamed: 0,General Status
General Status,1.0
Primary %,0.80323
Party Support?_Yes,0.352727
Female,0.262351
Emily Endorsed?_Yes,0.247194
Indivisible Endorsed?_Yes,0.191482
Biden Endorsed?_Yes,0.191333
Guns Sense Candidate?_Yes,0.180476
PCCC Endorsed?_Yes,0.168076
Warren Endorsed? _Yes,0.144256


In [67]:
#correlation_df = correlation_df.drop(index = ['Candidate.Gender_Male','total_runners.1'])
#index = ["row label"]

In [68]:
# Show the first 30 rows of the sorted frame.
correlation_df.head(30)

Unnamed: 0,General Status
General Status,1.0
Primary %,0.80323
Party Support?_Yes,0.352727
Female,0.262351
Emily Endorsed?_Yes,0.247194
Indivisible Endorsed?_Yes,0.191482
Biden Endorsed?_Yes,0.191333
Guns Sense Candidate?_Yes,0.180476
PCCC Endorsed?_Yes,0.168076
Warren Endorsed? _Yes,0.144256
