In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, spacy

import string
from pandarallel import pandarallel


In [36]:
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

train_df

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [37]:
nlp = spacy.load('en_core_web_sm')

In [38]:
idx = 0
fp = train_df['first_party'][idx]
sp = train_df['second_party'][idx]
facts = train_df['facts'][idx]
fact_tokens = nlp(facts)

print('first_party: ', fp)
print('second_party: ', sp)
print('facts: ', facts)


first_party:  Phil A. St. Amant
second_party:  Herman A. Thompson
facts:  On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. 
Thompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court hel

In [39]:
legal_verbs = {'argue': 740,
               'violate': 651,
               'sue': 526,
               'allege': 468,
               'convict': 436,
               'claim': 393,
               'accuse': 391,
               'implicate': 387,}

In [40]:
def inspect_token(fact_tokens):
    fp_name = 'The First Party'
    sp_name = 'The Second Party'

    fp_name_list = [n.lower() for n in fp.replace(',', '').split() if len(n) > 1]
    sp_name_list = [n.lower() for n in sp.replace(',', '').split() if len(n) > 1]

    fp_name_list_added = fp_name_list.copy()
    sp_name_list_added = sp_name_list.copy()

    for n in fp_name_list:
        changed_name = re.findall(rf"{n} ?\([a-z]+\)", fact_tokens.text.lower())
        if changed_name:
            fp_name_list_added.extend([re.sub(rf'({n}|[ {string.punctuation}])', '', cn) for cn in changed_name])

    for n in sp_name_list:
        changed_name = re.findall(rf"{n} ?\([a-z]+\)", fact_tokens.text.lower())
        if changed_name:
            sp_name_list_added.extend([re.sub(rf'({n}|[ {string.punctuation}])', '', cn) for cn in changed_name])

    fp_name_list = fp_name_list_added
    sp_name_list = sp_name_list_added

    def is_in_namelist(token):
        if 'NN' in token.tag_ or 'PROPN' in token.pos_:
            if token.text.lower() in fp_name_list:
                return fp_name
            elif token.text.lower() in sp_name_list:
                return sp_name
        return token.text

    def is_has_legal_verb(token, strong=False):
        if 'NN' in token.tag_ or 'PROPN' in token.pos_:
            head = token.head
            if head.text.lower() in fp_name_list:
                if token.pos_ == 'PROPN':
                    fp_name_list.append(token.text.lower())
                if strong:
                    return fp_name
            elif head.text.lower() in sp_name_list:
                if token.pos_ == 'PROPN':
                    sp_name_list.append(token.text.lower())
                if strong:
                    return sp_name
            elif 'VBD' in head.tag_ and head.lemma_ in legal_verbs:
                return is_in_namelist(token)
        return token.text

    before_len_fp = 0
    before_len_sp = 0

    fact_subj = [token.text for token in fact_tokens]

    while before_len_fp != len(fp_name_list) or before_len_sp != len(sp_name_list):
        for i, token in enumerate(reversed(fact_tokens)):
            party = is_in_namelist(token)
            if not party:
                party = is_has_legal_verb(token)
            fact_subj[len(fact_tokens) - i - 1] = party

        before_len_fp = len(fp_name_list)
        before_len_sp = len(sp_name_list)


    fact_subj = ' '.join(fact_subj)
    fact_subj = re.sub(rf'[{string.punctuation}]+', r' ', fact_subj)
    fact_subj = re.sub(rf"[\d\s]+", r' ', fact_subj)
    fact_subj = re.sub(rf"({fp_name} ?)+", f'{fp_name} ', fact_subj)
    fact_subj = re.sub(rf"({sp_name} ?)+", f'{sp_name} ', fact_subj)
    return fact_subj

('file', 895),
 ('hold', 820),
 ('argue', 740),
 ('violate', 651),
 ('find', 627),
 ('sue', 526),
 ('deny', 522),
 ('allege', 468),
 ('have', 456),
 ('convict', 436),
 ('affirm', 434),
 ('appeal', 418),
 ('claim', 393),
 ('grant', 390),
 ('seek', 361),
 ('require', 356),
 ('rule', 342),
 ('dismiss', 331),
 ('reverse', 319),
 ('make', 293),

In [41]:
inspect_token(fact_tokens)

'On June The First Party a candidate for public office made a television speech in Baton Rouge Louisiana During this speech The First Party accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union Finally The First Party implicated The Second Party an East Baton Rouge deputy sheriff in a scheme to move money between the Teamsters Union and The First Party ’s political opponent The Second Party successfully sued The First Party for defamation Louisiana ’s First Circuit Court of Appeals reversed holding that The Second Party did not show The First Party acted with “ malice ” The Second Party then appealed to the Supreme Court of Louisiana That court held that although public figures forfeit some of their First Amendment protection from defamation The First Party accused The Second Party of a crime with utter disregard of whether the remarks were true Finally that court held that the First Amendment protect

In [42]:
import re

def mask_text(text, words_to_mask):
    MASK_TOKEN = '[MASK]'
    for phrase in words_to_mask:
        # Escape special regex characters
        phrase = re.escape(phrase)
        text = re.sub(phrase, MASK_TOKEN, text, flags=re.IGNORECASE)
    return text

In [43]:
words_to_mask = ['The First Party', 'The Second Party']
print(inspect_token(fact_tokens))
masked = mask_text(inspect_token(fact_tokens), words_to_mask)
print(masked)

On June The First Party a candidate for public office made a television speech in Baton Rouge Louisiana During this speech The First Party accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union Finally The First Party implicated The Second Party an East Baton Rouge deputy sheriff in a scheme to move money between the Teamsters Union and The First Party ’s political opponent The Second Party successfully sued The First Party for defamation Louisiana ’s First Circuit Court of Appeals reversed holding that The Second Party did not show The First Party acted with “ malice ” The Second Party then appealed to the Supreme Court of Louisiana That court held that although public figures forfeit some of their First Amendment protection from defamation The First Party accused The Second Party of a crime with utter disregard of whether the remarks were true Finally that court held that the First Amendment protects

In [44]:
column_rename = {'first_party': 'fp',
                 'second_party': 'sp',
                 'first_party_winner': 'label'}

train_df.rename(columns=column_rename, inplace=True)
test_df.rename(columns=column_rename, inplace=True)

In [45]:
aug_df = pd.DataFrame({'ID': train_df['ID'],
                       'fp': train_df['sp'],
                       'sp': train_df['fp'],
                       'facts': train_df['facts'],
                       'label': 1-train_df['label']})


train_df = pd.concat([train_df, aug_df], ignore_index=True)
train_df

Unnamed: 0,ID,fp,sp,facts,label
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
4951,TRAIN_2473,"Renewable Fuels Association, et al.","HollyFrontier Cheyenne Refining, LLC, et al.",Congress amended the Clean Air Act through the...,0
4952,TRAIN_2474,"Alliance Bond Fund, Inc.","Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc., an investment fund, ...",0
4953,TRAIN_2475,United States,Peguero,"In 1992, the District Court sentenced Manuel D...",1
4954,TRAIN_2476,St. Cyr,Immigration and Naturalization Service,"On March 8, 1996, Enrico St. Cyr, a lawful per...",1


In [46]:
# train_df['fp'] = train_df['fp'].str.lower()
# train_df['sp'] = train_df['sp'].str.lower()
# train_df['facts'] = train_df['facts'].str.lower()

# test_df['fp'] = test_df['fp'].str.lower()
# test_df['sp'] = test_df['sp'].str.lower()
# test_df['facts'] = test_df['facts'].str.lower()

In [47]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a test of the tokenize function.")
doc

This is a test of the tokenize function.

In [48]:
import string

nlp = spacy.load("en_core_web_sm")

In [49]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)
train_df['new_facts'] = train_df.parallel_apply(lambda x: inspect_token(nlp(x['facts'])), axis=1)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=78), Label(value='0 / 78'))), HBox…

In [34]:
train_df

Unnamed: 0,ID,fp,sp,facts,label,new_facts
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1,On June The First Party a candidate for public...
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0,Ramon Nelson was riding his bike when he suffe...
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1,An Alabama state court convicted Billy Joe Mag...
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0,Victor Linkletter was convicted in state court...
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1,On April in Selma Alabama an intruder broke in...
...,...,...,...,...,...,...
4951,TRAIN_2473,"Renewable Fuels Association, et al.","HollyFrontier Cheyenne Refining, LLC, et al.",Congress amended the Clean Air Act through the...,0,Congress amended the Clean Air Act through the...
4952,TRAIN_2474,"Alliance Bond Fund, Inc.","Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc., an investment fund, ...",0,Alliance Bond Fund Inc an investment fund purc...
4953,TRAIN_2475,United States,Peguero,"In 1992, the District Court sentenced Manuel D...",1,In the District Court sentenced Manuel D Pegue...
4954,TRAIN_2476,St. Cyr,Immigration and Naturalization Service,"On March 8, 1996, Enrico St. Cyr, a lawful per...",1,On March Enrico The First Party Cyr a lawful p...


In [50]:
pandarallel.initialize(progress_bar=True)
train_df['masked_facts'] = train_df['new_facts'].parallel_apply(lambda x: mask_text(x, words_to_mask))

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=78), Label(value='0 / 78'))), HBox…

In [51]:
print(train_df['new_facts'][0])
print(train_df['masked_facts'][0])

On June The First Party a candidate for public office made a television speech in Baton Rouge Louisiana During this speech The First Party accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union Finally The First Party implicated The Second Party an East Baton Rouge deputy sheriff in a scheme to move money between the Teamsters Union and The First Party ’s political opponent The Second Party successfully sued The First Party for defamation Louisiana ’s First Circuit Court of Appeals reversed holding that The Second Party did not show The First Party acted with “ malice ” The Second Party then appealed to the Supreme Court of Louisiana That court held that although public figures forfeit some of their First Amendment protection from defamation The First Party accused The Second Party of a crime with utter disregard of whether the remarks were true Finally that court held that the First Amendment protects

In [52]:
pandarallel.initialize(progress_bar=True)
test_df['new_facts'] = test_df.parallel_apply(lambda x: inspect_token(nlp(x['facts'])), axis=1)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20), Label(value='0 / 20'))), HBox…

In [53]:
pandarallel.initialize(progress_bar=True)
test_df['masked_facts'] = test_df['new_facts'].parallel_apply(lambda x: mask_text(x, words_to_mask))

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20), Label(value='0 / 20'))), HBox…

In [57]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_df['tokenized_text'] = train_df['masked_facts'].apply(lambda x: tokenizer.encode(x, truncation=True, max_length=512))
train_df['decoded_text'] = train_df['tokenized_text'].apply(lambda x: tokenizer.decode(x))

# Print the original, masked, and decoded text for the first row
print("Original text:", train_df['facts'].iloc[0])
print("Party Encoded text:", train_df['new_facts'].iloc[0])
print("Masked text:", train_df['masked_facts'].iloc[0])
print("Decoded text:", train_df['decoded_text'].iloc[0])

Original text: On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. 
Thompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court held that the First Amendment protects uninhibited, robust deb

In [73]:
train_df

Unnamed: 0,ID,fp,sp,facts,label,new_facts,masked_facts,tokenized_text,decoded_text
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1,On June The First Party a candidate for public...,On June [MASK] a candidate for public office m...,"[101, 2006, 2238, 103, 1037, 4018, 2005, 2270,...",[CLS] on june [MASK] a candidate for public of...
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0,Ramon Nelson was riding his bike when he suffe...,Ramon Nelson was riding his bike when he suffe...,"[101, 12716, 5912, 2001, 5559, 2010, 7997, 204...",[CLS] ramon nelson was riding his bike when he...
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1,An Alabama state court convicted Billy Joe Mag...,An Alabama state court convicted Billy Joe Mag...,"[101, 2019, 6041, 2110, 2457, 7979, 5006, 3533...",[CLS] an alabama state court convicted billy j...
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0,Victor Linkletter was convicted in state court...,Victor Linkletter was convicted in state court...,"[101, 5125, 4957, 27901, 2099, 2001, 7979, 199...",[CLS] victor linkletter was convicted in state...
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1,On April in Selma Alabama an intruder broke in...,On April in Selma Alabama an intruder broke in...,"[101, 2006, 2258, 1999, 28112, 6041, 2019, 228...",[CLS] on april in selma alabama an intruder br...
...,...,...,...,...,...,...,...,...,...
4951,TRAIN_2473,"Renewable Fuels Association, et al.","HollyFrontier Cheyenne Refining, LLC, et al.",Congress amended the Clean Air Act through the...,0,Congress amended the Clean Air Act through the...,Congress amended the Clean Air Act through the...,"[101, 3519, 13266, 1996, 4550, 2250, 2552, 208...",[CLS] congress amended the clean air act throu...
4952,TRAIN_2474,"Alliance Bond Fund, Inc.","Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc., an investment fund, ...",0,Alliance Bond Fund Inc an investment fund purc...,Alliance Bond Fund Inc an investment fund purc...,"[101, 4707, 5416, 4636, 4297, 2019, 5211, 4636...",[CLS] alliance bond fund inc an investment fun...
4953,TRAIN_2475,United States,Peguero,"In 1992, the District Court sentenced Manuel D...",1,In the District Court sentenced Manuel D Pegue...,In the District Court sentenced Manuel D Pegue...,"[101, 1999, 1996, 2212, 2457, 7331, 7762, 1040...",[CLS] in the district court sentenced manuel d...
4954,TRAIN_2476,St. Cyr,Immigration and Naturalization Service,"On March 8, 1996, Enrico St. Cyr, a lawful per...",1,On March Enrico The First Party Cyr a lawful p...,On March Enrico [MASK] Cyr a lawful permanent ...,"[101, 2006, 2233, 21982, 103, 22330, 2099, 103...",[CLS] on march enrico [MASK] cyr a lawful perm...


In [74]:
train_df['facts'].apply(lambda x: len(x.split())).describe()

count    4950.000000
mean      174.109899
std        82.001006
min        14.000000
25%       120.000000
50%       164.000000
75%       218.000000
max       932.000000
Name: facts, dtype: float64

In [75]:
train_df = train_df[train_df['facts'].apply(lambda x: len(x.split())) >= 14]

In [114]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_df['masked_facts'], train_df['label'], test_size=0.2, random_state=42, stratify=train_df['label'])

In [115]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=300)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [116]:
svd = TruncatedSVD(n_components=150, random_state=42)
X_train_vec = svd.fit_transform(X_train_vec)
X_val_vec = svd.transform(X_val_vec)

In [117]:
from sklearn.svm import SVC

clf = SVC(C=1, kernel='rbf', gamma='scale', probability=True)
clf.fit(X_train_vec, y_train)

In [118]:
from sklearn.metrics import roc_auc_score, accuracy_score

y_pred = clf.predict_proba(X_val_vec).argmax(axis=1)
print(roc_auc_score(y_val, y_pred))

0.7535353535353535


In [119]:
print(accuracy_score(y_val, y_pred))

0.7535353535353535


In [120]:
y_pred = clf.predict_proba(X_val_vec).argmax(axis=1)
y_pred

array([0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,

In [121]:
accuracy_score(y_val, y_pred)

0.7535353535353535

In [122]:
X_test_vec = vectorizer.transform(test_df['masked_facts'])
X_test_vec = svd.transform(X_test_vec.toarray())

In [123]:
y_pred = clf.predict_proba(X_test_vec).argmax(axis=1)
y_pred


array([0, 1, 1, ..., 0, 1, 0])

In [125]:
y_pred.mean()

0.482258064516129

In [126]:
submission = pd.DataFrame({'ID': test_df['ID'], 'first_party_winner': y_pred})

In [29]:
submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,0
4,TEST_0004,1
...,...,...
1235,TEST_1235,1
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,1


In [57]:
submission.to_csv('ml_submission4.csv', index=False)

In [58]:
tmp = pd.read_csv('ml_submission2.csv')

In [59]:
(tmp['first_party_winner'] - submission['first_party_winner']).mean()

0.00967741935483871