# Data Cleaning, Feature Engineering Notebook

In [1]:
import numpy as np
import pandas as pd

# import surprise
# from surprise import Reader, Dataset
# from surprise.model_selection import cross_validate
# from surprise.prediction_algorithms import SVD
# from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
# from surprise.model_selection import GridSearchCV
from numpy import array
from numpy import diag
from numpy import dot
from numpy import zeros
from scipy.linalg import svd

# from scipy.sparse import csc_matrix
# from scipy.sparse.linalg import svds

# from scipy.spatial.distance import correlation, cosine
# from sklearn.metrics import pairwise_distances
# from sklearn.metrics import mean_squared_error
# from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('scotus_rulings.csv', index_col=0)

## Clean data and engineer metric
As seen by the sample rows, a justice can hold multiple opinions on a case.  They can simultaneously agree with the decision of the majority opinion but hold a separate reasoning(s) on the ruling (filing a concurrence).  With 1 being agreement and 2 filing a concurrence (and similarly 4 being dissent and 3 filing a concurrence/dissent).  The metric used here tries to capture the difference between those opinions by finding the mean of a justice's opinion (while removing filing multiple concurrences/dissents in a specific case).

In [3]:
# Map code string to metric function
def string_2_ints(s):    
    # Special case 'X' is no vote
    if s is np.nan:
        return np.nan
    if s == 'X':
        return np.nan
    
    # Split and remove non-integers
    nums = []
    for x in set(list(s)):
        try:
            nums.append(int(x))
        except:
            continue
    
    return np.mean(nums)

In [4]:
adj_df = pd.DataFrame(np.vectorize(string_2_ints)(df), index=df.index)

In [5]:
A = np.vectorize(string_2_ints)(df)

In [6]:
U, s, VT = svd(A)
# create m x n Sigma matrix
Sigma = zeros((A.shape[0], A.shape[1]))
# populate Sigma with n x n diagonal matrix
Sigma[:A.shape[1], :A.shape[1]] = diag(s)
# reconstruct matrix
B = U.dot(Sigma.dot(VT))
print(B)

ValueError: array must not contain infs or NaNs

In [None]:
justices = list(df.index)
cases = list(df.columns)
dicty = {'justice': [], 'case': [], 'vote': []}
for i in range(len(a)):
    for j in range(len(a[i])):
        dicty['justice'].append(justices[i])
        dicty['case'].append(cases[j])
        dicty['vote'].append(a[i][j])

In [None]:
new_df = pd.DataFrame.from_dict(dicty)

In [None]:
new_df.dropna(inplace=True)

In [None]:
new_df

In [None]:
reader = Reader(rating_scale=(1, 4))
data = Dataset.load_from_df(new_df, reader)

In [None]:
dataset = data.build_full_trainset()
print('Number of justices: ', dataset.n_users)
print('Number of cases: ', dataset.n_items)

In [None]:
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [None]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

In [None]:
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [None]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

In [None]:
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

In [None]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

In [None]:
svd = SVD(n_factors= 100, reg_all=0.02)
svd.fit(dataset)

In [None]:
svd.

In [None]:
P = svd.pu
Q = svd.qi
preds = P.dot(Q.T)

In [None]:
pd.DataFrame(preds)

In [None]:
adj_df