## Data

In [1]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')

train_df['fact_with_parties'] = 'First Party: ' + train_df['first_party'] + ', Second Party: ' + train_df['second_party'] + ', Legal Fact: ' + train_df['facts']
test_df['fact_with_parties'] = 'First Party: ' + test_df['first_party'] + ', Second Party: ' + test_df['second_party'] + ', Legal Fact: ' + test_df['facts']

train_facts = train_df['fact_with_parties'].tolist()
train_labels = train_df['first_party_winner'].astype(int).tolist()

test_facts = test_df['fact_with_parties'].tolist()

train_facts, val_facts, train_labels, val_labels = train_test_split(train_facts, train_labels, test_size=0.2, random_state=1004, stratify=train_labels)

## CountVectorizer

In [3]:
corpus = train_facts
count_vec = CountVectorizer(min_df=1, ngram_range=(1, 1))
count_array = count_vec.fit_transform(corpus).toarray().transpose()
print(len(count_vec.vocabulary_))
print(count_vec.vocabulary_)
print(count_array)

16408
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [4]:
count_df = pd.DataFrame(count_array)
count_df['count'] = count_df.sum(axis=1)
filtered_df = count_df[count_df['count'] > 1]
filtered_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1973,1974,1975,1976,1977,1978,1979,1980,1981,count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,114
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
16400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
16401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
16404,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9


In [17]:
deleted_df = count_df[count_df['count'] <= 1]
deleted_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1973,1974,1975,1976,1977,1978,1979,1980,1981,count
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
vocabulary = {v: k for v, k in enumerate(count_vec.get_feature_names_out())}
print(len(vocabulary))
print(vocabulary)

# Create a new dictionary with only the words that appear more than once
filtered_words = {vocabulary[i] for i in filtered_df.index}
print(filtered_words)
print(len(filtered_words))

16408
11272


In [6]:
count_vec_tokenizer = count_vec.build_tokenizer()

In [10]:
# Create the new corpus excluding words that appear less than twice
filtered_corpus = []
for text in corpus:
    tokenized_text = count_vec_tokenizer(text)
    text = ' '.join([word for word in text.split() if word.lower() in filtered_words])
    filtered_corpus.append(text)

print(filtered_corpus)



In [11]:
print(len(train_facts))
print(train_facts[0])

1982
First Party: Rick Perry, Governor of Texas et al., Second Party: Shannon Perez, et al., Legal Fact: The United States Census Bureau conducted a national census in 2010. In May and June of 2011, the Texas Legislature created a new electoral map based on changes in the state's population. Texas Governor Rick Perry signed the new map into law in July of 2011.
Under Section 5 of the Voting Rights Act of 1965, either the Justice Department or a special three-judge district court must approve new electoral maps before state officials may enact the map. Texas officials submitted its map to the three-judge court in Washington. The Washington court determined that state officials had used improper standards with respect to two districts. It further held that a three-judge panel in the United States District Court for the Western District of Texas must designate an interim redistricting plan for the 2012 election cycle.
The district court redrew 36 electoral districts. Governor Perry and ot

In [12]:
print(len(filtered_corpus))
print(filtered_corpus[0])

1982
First Rick Governor of Texas et Second Shannon et Legal The United States Census Bureau conducted national census in In May and June of the Texas Legislature created new electoral map based on changes in the Texas Governor Rick Perry signed the new map into law in July of Under Section of the Voting Rights Act of either the Justice Department or special district court must approve new electoral maps before state officials may enact the Texas officials submitted its map to the court in The Washington court determined that state officials had used improper standards with respect to two It further held that panel in the United States District Court for the Western District of Texas must designate an interim redistricting plan for the 2012 election The district court redrew 36 electoral Governor Perry and other state officials appealed the district redistricting to the Supreme Court and requested that the Supreme Court stop the enactment of the lower new The Supreme Court granted the 

## TfidfVectorizer

In [14]:
tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(filtered_corpus).toarray().transpose()
tfidf_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Pipeline