In [4]:
import warnings
warnings.simplefilter(action='ignore')

from sklearn import linear_model
from sklearn.utils import all_estimators
from collections import Counter
from zipfile import ZipFile
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pandas as pd
import os
import pathlib
import io
import re

In [5]:
#[(getattr(linear_model, model), type(getattr(linear_model, model))) for model in dir(linear_model)]
#PARAMETERS:
load_in = False
out_file ='full_with_headlines.csv'
Vec_sw = 'english'
Vec_max_f = 10000
Vec_ngr = (1,1)
candidate_list_split = " ~ "

In [54]:
#HELPERS
def read_zip(zip_fn, extract_fn=None):
    zf = ZipFile(zip_fn)
    if extract_fn:
        return zf.read(extract_fn)
    else:
        return [zf.read(name) for name in zf.namelist()][0]

def preprocess_headlines(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()

    text = re.sub(r'\d+', '', text)
    return text
    
def name_norm(n):
    return re.sub(r'[^A-Za-z0-9 ]+','',n)


In [7]:
SK_ALL = all_estimators(type_filter=['classifier'],)#'regressor'
names = [sk[0] for sk in SK_ALL if 'dummy' not in sk[0]]
modls = [sk[1] for sk in SK_ALL if 'dummy' not in sk[0]]
#names len(names)

In [85]:
dated_df = pd.read_excel("Elections/ElectionDates.xlsx")
dated_df = dated_df[['Year','Date']]
dated_df['Date'] = pd.to_datetime(dated_df.Date).dt.date
dated = dict(zip(dated_df.Year, dated_df.Date))

def fetch_preprocess_targets(f, dated=dated):
    def perc(row):
        if row['totalvotes'] == 0:
            return 0
        return row['candidatevotes'] / row['totalvotes']

    df = pd.read_csv(f,encoding = "ISO-8859-1")
    if 'president' in f:
        df['district'] = ['nationwide']*len(df)
    
        
    df['percentvote'] = df.apply(lambda row: perc(row), axis=1)

    df['date'] = df.apply(lambda row: dated[row['year']], axis=1)
    
    df = df[df['candidate'].notna()]
    df['candidate_norm'] = df['candidate'].apply(lambda n: name_norm(n))
    
    if 'party_detailed' in df.columns:
        df['party'] = df['party_detailed']
    
    if 'runoff' in df.columns:
        df = df[df['runoff'] ==False]
    
    if 'special' in df.columns:
        df = df[df['special'] ==False] 
    
    if 'stage' in df.columns:
        df = df[df['stage'].str.lower() == 'gen']
    
    df = df.drop(['version',
                  'notes',
                  'writein',
                  'unofficial',
                  'party_detailed',
                  'party_simplified', 
                  'runoff',
                  'special',
                  'mode',
                  'stage',
                  'fusion_ticket'], axis=1, errors='ignore')    #state_fips	state_cen	state_ic?
    
    df = df[~(df['year'] < 1999)]  

    df = df.sort_values(['year',
                         'state',
                         'district',
                         'totalvotes',
                         'percentvote'],ascending=False).groupby(['year',
                                                                  'state',
                                                                  'district',
                                                                  'totalvotes'])#.head(2).reset_index(drop=True) 
    
    grouped_df = []
    for name, group in df:
        tmp = group.head(2)
        if tmp['percentvote'].sum() >= 0.70:
            pass
        else:
            tmp = group.head(3)

        tmp['candidates'] = [candidate_list_split.join(list(tmp['candidate'].astype('string')))] * len(tmp)
        grouped_df.append(tmp)
    
    df = pd.concat(grouped_df)
    
    #df = df[df['percentvote'] >= 0.15]
    #df = df[df['percentvote'] <= 0.85]
    return df

In [86]:
fails = []
def merge_news_elections(election_news, elections_source):
    elections_source['headlines'] = [""] * len(elections_source)
    elections_source['file'] = [""] * len(elections_source)
    elections_source = elections_source.reset_index(drop=True)
    
    #news_df = pd.DataFrame(columns=['candidate','file','headlines','date'])

    for df in election_news:
        candidate = df.loc[0,'candidate']
        file_c = df.loc[0,'file']
        date = file_c.split('_')[-1][:-4] #drop .ZIP
        year = int(date[-4:])    #grab last 4 of date
        headlines = list(set([preprocess_headlines(text) for text in df['Headline']]))
        bagofwords = '    '.join(headlines)
        e_idx = race_matching(candidate, year, elections_source, headlines)
        if e_idx != -1:
            elections_source.at[e_idx,'headlines']  = bagofwords
            elections_source.at[e_idx,'file']       = file_c
        else:
            fails.append(df)
    print('FAILS',len(fails))
    return elections_source, fails

In [92]:
def read_elections():
    elections = []
    for fil in ['1976-2020-president.csv','1976-2020-senate.csv','1976-2020-house.csv']:
        elections.append(fetch_preprocess_targets('Elections/'+fil, dated))
    return pd.concat(elections)

def read_news():
    this_directory = pathlib.Path().absolute()
    read_path = os.path.join(str(this_directory),'NewsData')
    news_files = [(os.path.join(read_path, f), str(f).split('_')[0]) for f in os.listdir(read_path)]
    return_dfs = []
    for a,b in news_files:
        if "BLANK VOTE" not in b:
            subfil = read_zip(a)
            return_dfs.append(pd.read_excel(io.BytesIO(subfil)).assign(file=a,candidate=b))
    return return_dfs

def masking_names(row):
    target = re.sub(r'[^\w\s]', '', row['candidate'])
    opponent = re.sub(r'[^\w\s]', '', row['opponents'])
    text = re.sub(r'[^\w\s]', '', row['headlines'])

    text = text.lower()
    opponent = opponent.lower()
    target = target.lower()

    text = re.sub(r'\d+', '', text)

    for wd in target.split(' '):
        tmp = [t if t != wd and t != wd+"s" else 'thecandidate' for t in text.split(' ')]
        text = " ".join(tmp)
        
    for wd in opponent.split(' '):
        tmp = [t if t != wd and t != wd+"s" else 'theopponent' for t in text.split(' ')]
        text = " ".join(tmp)
        
    return text


def race_matching(candidate0, year, source, headlines):
    def candidate_matcher(c, y, src, pres, state, col):
        if c in pres or name_norm(c) in pres:
            tmp = src.index[(src[col] == c) & (src['state'] == state) & (src['year'] == y)]
            if len(tmp) == 1:
                return tmp[0]
        else:
            tmp = src.index[(src[col] == c) & (src['year'] == y)]
            if len(tmp) == 1:
                return tmp[0]
        return None
    
    candidates = source['candidate'].unique()
    presidents = source[source['office'] == "US PRESIDENT"]['candidate'].unique()
    states = source['state'].unique()

    state = None
    for st in states:
        if st in candidate0:
            state = st
            break
    
    if state:
        candidate0 = re.sub(state,"", candidate0)
        
    r = candidate_matcher(candidate0, year, source, presidents, state,'candidate')
    if r:
        return r

    r = candidate_matcher(candidate0, year, source, presidents, state, 'candidate_norm')
    if r:
        return r
    
    return -1
    
def fill_opponent(row):
    tmp =" "
    others = row['candidates'].split(candidate_list_split)
    for nm in others:
        if nm != row['candidate']:
            tmp = tmp + nm
    return tmp

def vocabulary(df):
    all_docs = df['headlines']
    pass

def bow_plus(df): #+state and party
    def party_adjust(r):
        if 'DEMOC' in r:
            return 1.0
        elif "REPUB" in r:
            return 0.0
        else: 
            return -1.0

    print(len(df['party']))
    docs = df['headlines']
    df['party'] =df['party'].fillna('-1')
    parties = df['party'].apply(lambda r: party_adjust(r))
    states = df['state'].unique()
    print(Counter(parties))

    CountVec = TfidfVectorizer(use_idf=True, 
                               smooth_idf=False,
                               ngram_range=Vec_ngr, # to use bigrams ngram_range=(2,2)
                               stop_words=Vec_sw,
                               max_features=Vec_max_f)
        
    Count_data = CountVec.fit_transform(docs)

    cvdf = pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
    cvdf['PARTYX'] = parties
    cvdf['PARTYX'].fillna(-1,inplace=True)
    return cvdf

def model_initializer():
    n_estimators = ""
    learning_rate= ""
    pass

def train_test_model():#fit, predict, score
    pass

In [93]:
#Counter([len(g) for n,g in elections_df.groupby(['year','state','district','totalvotes'])])

In [30]:
if not load_in:
    elections_df = read_elections()
    election_news = read_news()    

In [88]:
if not load_in:
    news_df, fails_df = merge_news_elections(election_news, elections_df)
    news_df.to_csv(out_file)
else:
    news_df = pd.read_csv(out_file)
news_df

FAILS 613


Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,candidate,candidatevotes,totalvotes,district,percentvote,date,candidate_norm,party,candidates,headlines,file
0,2000,ALABAMA,AL,1,63,41,US PRESIDENT,"BUSH, GEORGE W.",941173,1666272,nationwide,0.564838,2000-11-07,BUSH GEORGE W,REPUBLICAN,"BUSH, GEORGE W. ~ GORE, AL",,
1,2000,ALABAMA,AL,1,63,41,US PRESIDENT,"GORE, AL",692611,1666272,nationwide,0.415665,2000-11-07,GORE AL,DEMOCRAT,"BUSH, GEORGE W. ~ GORE, AL",,
2,2000,ALASKA,AK,2,94,81,US PRESIDENT,"BUSH, GEORGE W.",167398,285560,nationwide,0.586210,2000-11-07,BUSH GEORGE W,REPUBLICAN,"BUSH, GEORGE W. ~ GORE, AL",,
3,2000,ALASKA,AK,2,94,81,US PRESIDENT,"GORE, AL",79004,285560,nationwide,0.276663,2000-11-07,GORE AL,DEMOCRAT,"BUSH, GEORGE W. ~ GORE, AL",,
4,2000,ARIZONA,AZ,4,86,61,US PRESIDENT,"BUSH, GEORGE W.",781652,1532016,nationwide,0.510211,2000-11-07,BUSH GEORGE W,REPUBLICAN,"BUSH, GEORGE W. ~ GORE, AL",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,2020,WISCONSIN,WI,55,35,25,US HOUSE,TRICIA ZUNKER,162741,415007,7,0.392140,2020-11-03,TRICIA ZUNKER,DEMOCRAT,THOMAS P TIFFANY ~ TRICIA ZUNKER,tiffany doesnt condemn qanon house resolution ...,d:\FALL2022\capstone\NewsData\TRICIA ZUNKER_11...
5527,2020,WISCONSIN,WI,55,35,25,US HOUSE,MIKE GALLAGHER,268173,417838,8,0.641811,2020-11-03,MIKE GALLAGHER,REPUBLICAN,MIKE GALLAGHER ~ AMANDA STUCK,cortez masto scott introduce bill to connect m...,d:\FALL2022\capstone\NewsData\MIKE GALLAGHER_1...
5528,2020,WISCONSIN,WI,55,35,25,US HOUSE,AMANDA STUCK,149558,417838,8,0.357933,2020-11-03,AMANDA STUCK,DEMOCRAT,MIKE GALLAGHER ~ AMANDA STUCK,wisconsin dems continues whats at stake gotv b...,d:\FALL2022\capstone\NewsData\AMANDA STUCK_11-...
5529,2020,WYOMING,WY,56,83,68,US HOUSE,LIZ CHENEY,185732,278503,0,0.666894,2020-11-03,LIZ CHENEY,REPUBLICAN,LIZ CHENEY ~ LYNNETTE GREY BULL,cheney honors wyoming s veterans at cheyenne n...,d:\FALL2022\capstone\NewsData\LIZ CHENEY_11-3-...


In [94]:
whatwehave = news_df[(news_df['headlines'].notna()) & (news_df['headlines'] != "")]
whatwehave['opponents'] = whatwehave.apply(lambda x: fill_opponent(x), axis=1)
whatwehave['headlines'] = whatwehave.apply(lambda x: masking_names(x), axis=1)
#whatwehave.groupby(['year','state','district','totalvotes'])
whatwehave['winner'] = whatwehave.groupby(['year','state','district','totalvotes'], sort=False)['candidatevotes'].transform(max)
whatwehave['winner'] = whatwehave['winner'] == whatwehave['candidatevotes']
whatwehave

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,candidate,candidatevotes,totalvotes,district,percentvote,date,candidate_norm,party,candidates,headlines,file,opponents,winner
612,2000,ARIZONA,AZ,4,86,61,US SENATE,JON KYL,1108196,1397076,statewide,0.793225,2000-11-07,JON KYL,REPUBLICAN,JON KYL ~ WILLIAM TOEL,thecandidate criticizes use of senate floor fo...,d:\FALL2022\capstone\NewsData\JON KYL_11-7-200...,WILLIAM TOEL,True
614,2000,CALIFORNIA,CA,6,93,71,US SENATE,DIANNE FEINSTEIN,5932522,10623608,statewide,0.558428,2000-11-07,DIANNE FEINSTEIN,DEMOCRAT,DIANNE FEINSTEIN ~ TOM CAMPBELL,court rules for oakland mayor brown in suit ov...,d:\FALL2022\capstone\NewsData\DIANNE FEINSTEIN...,TOM CAMPBELL,True
615,2000,CALIFORNIA,CA,6,93,71,US SENATE,TOM CAMPBELL,3886853,10623608,statewide,0.365869,2000-11-07,TOM CAMPBELL,REPUBLICAN,DIANNE FEINSTEIN ~ TOM CAMPBELL,theopponent close calif races will help decide...,d:\FALL2022\capstone\NewsData\TOM CAMPBELL_11-...,DIANNE FEINSTEIN,False
616,2000,CONNECTICUT,CT,9,16,1,US SENATE,JOSEPH I. LIEBERMAN,828902,1311261,statewide,0.632141,2000-11-07,JOSEPH I LIEBERMAN,DEMOCRAT,JOSEPH I. LIEBERMAN ~ PHIL GIORDANO,thecandidate cheats connecticut by running for...,d:\FALL2022\capstone\NewsData\JOSEPH I. LIEBER...,PHIL GIORDANO,True
617,2000,CONNECTICUT,CT,9,16,1,US SENATE,PHIL GIORDANO,448077,1311261,statewide,0.341715,2000-11-07,PHIL GIORDANO,REPUBLICAN,JOSEPH I. LIEBERMAN ~ PHIL GIORDANO,rowland says johnson theopponent at the top of...,d:\FALL2022\capstone\NewsData\PHIL GIORDANO_11...,JOSEPH I. LIEBERMAN,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,2020,WISCONSIN,WI,55,35,25,US HOUSE,TRICIA ZUNKER,162741,415007,7,0.392140,2020-11-03,TRICIA ZUNKER,DEMOCRAT,THOMAS P TIFFANY ~ TRICIA ZUNKER,theopponent doesnt condemn qanon house resolut...,d:\FALL2022\capstone\NewsData\TRICIA ZUNKER_11...,THOMAS P TIFFANY,True
5527,2020,WISCONSIN,WI,55,35,25,US HOUSE,MIKE GALLAGHER,268173,417838,8,0.641811,2020-11-03,MIKE GALLAGHER,REPUBLICAN,MIKE GALLAGHER ~ AMANDA STUCK,cortez masto scott introduce bill to connect m...,d:\FALL2022\capstone\NewsData\MIKE GALLAGHER_1...,AMANDA STUCK,True
5528,2020,WISCONSIN,WI,55,35,25,US HOUSE,AMANDA STUCK,149558,417838,8,0.357933,2020-11-03,AMANDA STUCK,DEMOCRAT,MIKE GALLAGHER ~ AMANDA STUCK,wisconsin dems continues whats at stake gotv b...,d:\FALL2022\capstone\NewsData\AMANDA STUCK_11-...,MIKE GALLAGHER,False
5529,2020,WYOMING,WY,56,83,68,US HOUSE,LIZ CHENEY,185732,278503,0,0.666894,2020-11-03,LIZ CHENEY,REPUBLICAN,LIZ CHENEY ~ LYNNETTE GREY BULL,thecandidate honors wyoming theopponent vetera...,d:\FALL2022\capstone\NewsData\LIZ CHENEY_11-3-...,LYNNETTE GREY BULL,True


In [106]:
X = bow_plus(whatwehave)
y = whatwehave['winner']
vocab = list(X.columns)
X

4059
Counter({1.0: 2062, 0.0: 1938, -1.0: 59})


Unnamed: 0,aan,aarp,ab,aba,abandon,abandoned,abandoning,abandons,abbott,abby,...,zoom,zu,zum,zur,zvsan,zwei,élections,étatsunis,über,PARTYX
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


In [96]:
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y, test_size=0.25,shuffle=True)

In [98]:
res_book = []
for i in range(len(modls)):
    name = names[i]
    try:
        model = modls[i]()
        model.fit(X=X_train, y=y_train)
        preds = model.predict(X_test)
        res_book.append((name,model.score(X_test, y_test),model))
        print((name,model.score(X_test, y_test)))
    except Exception as e:
        #print(name, 'FAIL', str(e)[:100])
        pass

('AdaBoostClassifier', 0.7605911330049261)
('BaggingClassifier', 0.7704433497536946)
('BernoulliNB', 0.6285714285714286)
('CalibratedClassifierCV', 0.7911330049261084)
('DecisionTreeClassifier', 0.7073891625615764)
('DummyClassifier', 0.5684729064039409)
('ExtraTreeClassifier', 0.6532019704433497)
('ExtraTreesClassifier', 0.7921182266009852)
('GaussianNB', 0.6857142857142857)
('GaussianProcessClassifier', 0.7970443349753694)
('GradientBoostingClassifier', 0.7832512315270936)
('HistGradientBoostingClassifier', 0.8)
('KNeighborsClassifier', 0.6147783251231527)
('LabelPropagation', 0.6344827586206897)
('LabelSpreading', 0.6413793103448275)
('LinearDiscriminantAnalysis', 0.6216748768472906)
('LinearSVC', 0.7921182266009852)
('LogisticRegression', 0.7980295566502463)
('LogisticRegressionCV', 0.7960591133004926)
('MLPClassifier', 0.7438423645320197)
('NearestCentroid', 0.6275862068965518)
('NuSVC', 0.7852216748768472)
('PassiveAggressiveClassifier', 0.7625615763546798)
('Perceptron', 0.74778

In [100]:
res_book2 = [r for r in res_book if r[1] >= 0.75]
print(*res_book2, sep='\n')

('AdaBoostClassifier', 0.7605911330049261, AdaBoostClassifier())
('BaggingClassifier', 0.7704433497536946, BaggingClassifier())
('CalibratedClassifierCV', 0.7911330049261084, CalibratedClassifierCV())
('ExtraTreesClassifier', 0.7921182266009852, ExtraTreesClassifier())
('GaussianProcessClassifier', 0.7970443349753694, GaussianProcessClassifier())
('GradientBoostingClassifier', 0.7832512315270936, GradientBoostingClassifier())
('HistGradientBoostingClassifier', 0.8, HistGradientBoostingClassifier())
('LinearSVC', 0.7921182266009852, LinearSVC())
('LogisticRegression', 0.7980295566502463, LogisticRegression())
('LogisticRegressionCV', 0.7960591133004926, LogisticRegressionCV())
('NuSVC', 0.7852216748768472, NuSVC())
('PassiveAggressiveClassifier', 0.7625615763546798, PassiveAggressiveClassifier())
('RandomForestClassifier', 0.7931034482758621, RandomForestClassifier())
('RidgeClassifier', 0.7921182266009852, RidgeClassifier())
('RidgeClassifierCV', 0.7921182266009852, RidgeClassifierCV()

In [112]:
def test_best(m):
    preds = m.predict(X_test)
    print(classification_report(preds,y_test))
test_best(res_book2[6][2])

              precision    recall  f1-score   support

       False       0.80      0.75      0.78       465
        True       0.80      0.84      0.82       550

    accuracy                           0.80      1015
   macro avg       0.80      0.80      0.80      1015
weighted avg       0.80      0.80      0.80      1015



In [108]:
import pickle
with open('pickled//trained.mods','wb') as f:
    pickle.dump(res_book, f)
with open('pickled//bestmodel.svc','wb') as f:
    pickle.dump(res_book2[16], f)
with open('pickled//vocab.pickle','wb') as f:
    pickle.dump(vocab, f)