In [1]:
from google.colab import files

In [None]:
upload = files.upload()

Saving clean_data.csv to clean_data.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline

In [None]:
df = pd.read_csv("clean_data.csv")

In [None]:
df.columns

Index(['Unnamed: 0', 'ID', 'name', 'href', 'docket', 'term', 'first_party',
       'second_party', 'facts', 'facts_len', 'majority_vote', 'minority_vote',
       'first_party_winner', 'decision_type', 'disposition', 'issue_area'],
      dtype='object')

In [None]:
categorical_cols = ['issue_area', 'decision_type', 'disposition']
label_encoders = {}

In [None]:
for col in categorical_cols:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [None]:
# Drop rows where 'first_party_winner' is NaN
df = df.dropna(subset=['first_party_winner'])

In [None]:
# Now we split the data into features and target arrays
X = df[['facts', 'issue_area_encoded', 'decision_type_encoded', 'disposition_encoded']]  # add other features if necessary
y = df['first_party_winner'].astype(int)

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# We'll vectorize the 'facts' text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # adjust parameters as necessary

In [None]:
# Fit and transform on train, transform on test
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['facts'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['facts'])

In [None]:
# Combine TF-IDF features with other features, we'll have to convert them to a dense format to concatenate with the encoded features
X_train_others = X_train.drop('facts', axis=1).reset_index(drop=True)
X_test_others = X_test.drop('facts', axis=1).reset_index(drop=True)

In [None]:
X_train_combined = pd.concat([pd.DataFrame(X_train_tfidf.toarray()), X_train_others], axis=1)
X_test_combined = pd.concat([pd.DataFrame(X_test_tfidf.toarray()), X_test_others], axis=1)

In [None]:
# Assuming X_train_combined and X_test_combined are created as before
# Convert all feature names to strings to ensure compatibility
X_train_combined.columns = [str(col) for col in X_train_combined.columns]
X_test_combined.columns = [str(col) for col in X_test_combined.columns]

# Now, the model should fit without the error
model.fit(X_train_combined, y_train)


In [None]:
# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_combined, y_train)

In [None]:
# Predict on the test set
y_pred = model.predict(X_test_combined)

In [None]:
# Evaluate the model's performance
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.92      0.95       252
           1       0.95      0.98      0.97       406

    accuracy                           0.96       658
   macro avg       0.96      0.95      0.96       658
weighted avg       0.96      0.96      0.96       658

Accuracy: 0.958966565349544


In [None]:
# Feature Importance
feature_importances = model.feature_importances_
feature_names = list(tfidf_vectorizer.get_feature_names_out()) + list(X_train_others.columns)
feature_importance_dict = dict(zip(feature_names, feature_importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

print("Feature Importances:", sorted_feature_importance[:20])  # Print top 20 features

Feature Importances: [('disposition_encoded', 0.4942284193169842), ('decision_type_encoded', 0.009894616876911393), ('court', 0.005106734727919689), ('issue_area_encoded', 0.0036323714525165647), ('circuit', 0.0033388956071602406), ('district', 0.003101155797368563), ('reversed', 0.002967644522618084), ('law', 0.0029616717534317162), ('appeals', 0.002945039095873099), ('state', 0.0028504089224221305), ('act', 0.0026204103534282664), ('held', 0.0025142802238844442), ('federal', 0.0024513319659460486), ('argued', 0.002270003515427614), ('did', 0.0021848732552458582), ('cases', 0.0021703001603005554), ('ninth', 0.002168387768705945), ('affirmed', 0.0020197680521126505), ('case', 0.002019556805343158), ('convicted', 0.001927235019667194)]


In [None]:
for i, pred in enumerate(y_pred):
    winner = 'First Party' if pred == 1 else 'Second Party'
    print(f"Case {i+1}: The predicted winner is {winner}")

Case 1: The predicted winner is Second Party
Case 2: The predicted winner is First Party
Case 3: The predicted winner is First Party
Case 4: The predicted winner is Second Party
Case 5: The predicted winner is First Party
Case 6: The predicted winner is Second Party
Case 7: The predicted winner is First Party
Case 8: The predicted winner is First Party
Case 9: The predicted winner is First Party
Case 10: The predicted winner is Second Party
Case 11: The predicted winner is First Party
Case 12: The predicted winner is Second Party
Case 13: The predicted winner is Second Party
Case 14: The predicted winner is Second Party
Case 15: The predicted winner is First Party
Case 16: The predicted winner is First Party
Case 17: The predicted winner is Second Party
Case 18: The predicted winner is First Party
Case 19: The predicted winner is Second Party
Case 20: The predicted winner is First Party
Case 21: The predicted winner is First Party
Case 22: The predicted winner is First Party
Case 23: T