In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from scipy.stats.stats import pearsonr
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from math import sqrt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("18EA1-5 database cleaned HO only-version2.xlsx")

### Data Analysis

In [3]:
data.head(5)

Unnamed: 0,Cow,Farm,System,Parity,ParityCategory,BCS,DIM,DIC,Pregnant,FatPEBV,...,"10,12/preformed",20:2n6/preformed,22/preformed,20:3n6/preformed,20:4n6/preformed,20:5n3/preformed,MFD,(C13 - C11)/OBC,(C14 - C12)/DN,Trans as % of preformed
0,43,1,CM,5,3plus,3.25,200.0,0,N,-0.2,...,0.0,0.000726,0.000663,0.004169,0.003576,0.000746,No,0.017162,0.305421,0.075902
1,51,1,CM,5,3plus,2.75,72.0,0,U,0.07,...,0.00162,0.0,0.000624,0.001575,0.002038,0.0,Yes,0.022726,0.281875,0.1978
2,405,1,CM,5,3plus,,212.0,0,N,,...,0.0,0.001104,0.001,0.0031,0.004451,0.000705,No,0.010826,0.240153,0.08482
3,408,1,CM,4,3plus,2.75,211.0,0,U,-0.07,...,0.0,0.001005,0.000442,0.002461,0.004562,0.000727,Yes,0.013623,0.296892,0.105783
4,423,1,CM,5,3plus,2.875,228.0,0,N,-0.215,...,0.0,0.000849,0.000583,0.003857,0.003656,0.00048,Yes,0.011611,0.297311,0.098821


#### Null values in Dataset

In [4]:
null_summary = data.isnull().sum().sort_values(ascending=False)

C20:2n6 and BCS have too many null values. These are features that seem like they would be important in determining our target values. Imputing for these instances would not be smart, as it could have negative effects on our model. We will choose to remove these two features instead, in order to preserve more data. 

# Task 6

We will first add the categorical variables and retrain our models accordingly

In [77]:
null_summary = data.isnull().sum().sort_values(ascending=False)

targets = ["AvgMilk", "Fat%", "FatY", "Pro%", "ProY"]
questionable_features = ["ECM", "0.4 FCM", "SCCS"]
null_columns = list(null_summary.index[:12])


filtered_data = data.select_dtypes(include="float")
numerical_features = filtered_data.columns
categorical_features = data.columns.difference(filtered_data.columns)

# drop targets, questionable_features, and features with too many nulls, then drop rows with null values
X = data.drop(targets + null_columns + questionable_features, axis=1).dropna()



In [7]:
#Replace categorical features with dummy columns
for cat in categorical_features:
    dummies = pd.get_dummies(X[cat]).rename(columns=lambda x: cat + '_' + str(x))
    X = pd.concat([X, dummies], axis=1)
    X = X.drop([cat], axis=1)

In [78]:
X = X.drop(["Parity","Cow"],axis=1)

In [79]:
# subset y with rows of X, and drop target rows with null values
y = data.loc[X.index, targets].dropna()

# subset X for only available rows of y
X = X.loc[y.index, :]

num_feat = numerical_features.difference(targets)
num_feat = num_feat.difference(questionable_features)
num_feat = num_feat.difference(null_columns)
# normalize feature vectors only over numerical features
X[np.array(num_feat)] = X[num_feat].apply(lambda x: (x - x.mean()) / x.std())

X = X.reset_index()
X = X.drop("index",axis=1)


In [80]:
X.head()

Unnamed: 0,Farm,System,ParityCategory,DIM,DIC,Pregnant,DN,OBC,16C,18C,...,"10,12/preformed",20:2n6/preformed,22/preformed,20:3n6/preformed,20:4n6/preformed,20:5n3/preformed,MFD,(C13 - C11)/OBC,(C14 - C12)/DN,Trans as % of preformed
0,1,CM,3plus,0.258965,0,N,-0.574733,-1.626048,0.098354,0.464811,...,-0.287626,0.64466,-1.328113,0.680696,-0.744054,0.183113,No,1.660103,1.060692,-0.412577
1,1,CM,3plus,-1.094116,0,U,-1.513597,-0.462016,-1.327251,2.358164,...,6.626223,-0.859004,-1.484332,-1.898332,-2.151314,-1.405455,Yes,3.535034,0.391109,4.819629
2,1,CM,3plus,0.385816,0,N,-1.236087,0.377621,-1.667484,2.298997,...,-0.287626,1.426665,0.051436,-0.381637,0.05677,0.097435,No,-0.47495,-0.795354,-0.029766
3,1,CM,3plus,0.375245,0,U,-0.577326,1.565306,0.860363,-0.155425,...,-0.287626,1.220311,-2.229199,-1.017587,0.158302,0.142689,Yes,0.467611,0.818176,0.870006
4,1,CM,3plus,0.554951,0,N,-0.997481,-0.391849,-0.264374,1.221744,...,-0.287626,0.898513,-1.651709,0.370336,-0.670789,-0.382518,Yes,-0.210348,0.830068,0.57118


## Training Models with Categorical Variables

In [11]:
lasso_target_scores_cat = {}
rf_target_scores_cat = {}
targets = ["AvgMilk", "Fat%", "FatY", "Pro%", "ProY"]
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(X, y[target],test_size=.2)

    lasso = Lasso(alpha=0)
    lasso.fit(X_train, y_train)
    lasso_target_scores_cat[target] = lasso.score(X_test, y_test)
    
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(X, y[target],test_size=.2)
    
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    rf_target_scores_cat[target] = rf.score(X_test, y_test)
    



    

In [12]:
print("Lasso with Categorical Features")
print(lasso_target_scores_cat, end="\n\n")
print("Random Forest with Categorical Features")
print(rf_target_scores_cat, end="\n\n")


Lasso with Categorical Features
{'AvgMilk': 0.8103807433048387, 'Fat%': 0.7165111121776115, 'FatY': 0.9999669315529083, 'Pro%': 0.5723596068205302, 'ProY': 0.7549960946056153}

Random Forest with Categorical Features
{'AvgMilk': 0.7590614856249595, 'Fat%': 0.5956089294476881, 'FatY': 0.9920188271307946, 'Pro%': 0.5409855911000264, 'ProY': 0.758458224141088}



## PCA on entire dataset

In [30]:
pca_all = {}
for n in range(1,100):
    pca = PCA(n_components=n)
    pca.fit(X,y[target])
    new_X = pd.DataFrame(pca.transform(X))
    X_train, X_test, y_train, y_test = train_test_split(new_X, y[target],test_size=.2)
    lasso = Lasso(alpha=0)
    lasso.fit(X_train,y_train)
    pca_all[n] = np.mean(cross_val_score(lasso, new_X, y[target], cv=5))

sorted_all = sorted(pca_all.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_all[:5])

[(49, 0.765218982865557), (48, 0.7650026261948787), (89, 0.764576039475415), (90, 0.7638682222304763), (50, 0.7636973454711222)]


## PCA on subset

In [24]:
usable = ["DIM","DIC","AvgRum","Farm_1","Farm_2","Farm_3","Farm_4","Farm_5","MFD_No","MFD_Yes","ParityCategory_1","ParityCategory_2","ParityCategory_3plus","Pregnant_N","Pregnant_P","Pregnant_U","System_CM","System_SCR"]
X_usable = X[usable]
X_needs_PCA = X.drop(usable,axis=1)
X_usable = X_usable.reset_index()

print(X_needs_PCA.shape)

(1575, 101)


### Optimizing Hyper parameters for PCA

In [33]:
optimized_scores = {}
targets = ["AvgMilk"]
for target in targets:
    for n in range(1,100):
        pca = PCA(n_components=n)
        pca.fit(X_needs_PCA,y[target])
        new_X = pd.DataFrame(pca.transform(X_needs_PCA))
        new_X = X_usable.join(new_X)
        X_train, X_test, y_train, y_test = train_test_split(new_X, y[target],test_size=.2)
        lasso = Lasso(alpha=0)
        lasso.fit(X_train,y_train)
        optimized_scores[n] = np.mean(cross_val_score(lasso, new_X, y[target], cv=5))
    

In [34]:
sorted_x = sorted(optimized_scores.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_x[:5])

[(78, 0.7564194016317944), (76, 0.7563910220326788), (79, 0.7562514108564395), (77, 0.755514092146855), (75, 0.7526080178411075)]


As we can see, running PCA over a subset of the dataset doesn't benefit as we thought it would, so we're going to stick with running PCA over the entire dataset.

### Hyper parameter optimization for Random Forest

In [31]:
parameter_scores = {}
for trees in [None,10,25,50]:
    rf = RandomForestRegressor(n_estimators=100,max_depth=trees)
    parameter_scores[trees] = np.mean(cross_val_score(rf,X,y[target],cv=5))

print(parameter_scores)

{None: 0.6464476303243053, 10: 0.6469146312953848, 25: 0.6475005998766874, 50: 0.6515490467936693}


In [32]:
parameter_scores = {}
for trees in [1,3,8,20]:
    rf = RandomForestRegressor(n_estimators=100,max_depth=50,min_samples_leaf=trees)
    parameter_scores[trees] = np.mean(cross_val_score(rf,X,y[target],cv=5))
print(parameter_scores)

{1: 0.642479369992148, 3: 0.6423609233331999, 8: 0.6371783205787537, 20: 0.6015842162819055}


In [33]:
parameter_scores = {}
for trees in [1,3,8,20]:
    rf = RandomForestRegressor(n_estimators=100,max_depth=50,min_samples_leaf=trees, bootstrap=True)
    parameter_scores[trees] = np.mean(cross_val_score(rf,X,y[target],cv=5))
print(parameter_scores)

{1: 0.645257918975101, 3: 0.6434569077656944, 8: 0.6307531255124291, 20: 0.6055800384795125}


Doesn't improve our results, dumping Random Forest

## Weight Vector Analysis

In [54]:
lasso = Lasso(alpha=0)
X_train, X_test, y_train, y_test = train_test_split(X, y["AvgMilk"],test_size=.2)
lasso.fit(X_train,y_train)
print(lasso.score(X_test,y_test))

0.8466456005049867


In [72]:
coefs_dict = {}
for i in range(len(list(X))):
    coefs_dict[list(X)[i]] = lasso.coef_[i]

sorted_x = sorted(coefs_dict.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_x[:5])
rev = sorted_x[-5:]
print(rev[::-1])

[('C11:0', 12.806152370693237), ('C10:0', 12.074778437755139), ('C17:0', 11.659892536897123), ('C10/DN', 11.639437349051468), ('C14:0', 10.62044559030371)]
[('C12:0', -11.432765807243374), ('C13:0', -9.883991210270706), ('C6/DN', -9.543575700181627), ('C15:0', -9.517582430177475), ('C11/OBC', -8.457307750619744)]


### Most useless features

In [76]:
useless = sorted(coefs_dict.items(), key=lambda kv: abs(kv[1]))
print(useless[:30])

[('MFD_Yes', 2.4480453553024087e-16), ('System_SCR', 5.456774191999631e-16), ('16/16C', 0.00854680156442817), ('DIC', -0.0362964029756437), ('Pregnant_N', -0.04449736359112887), ('C24:1n9', -0.08065665857554456), ('C22:4n6', -0.0966336178438781), ('181c12/preformed', -0.10075920462386159), ('C18:1c11', -0.10300993384080817), ('t12/preformed', 0.16816737022544068), ('C22:5n3', -0.1841156021005176), ('iC17:0', -0.22940653553979146), ('Farm_2', -0.36526958707758533), ('t15/preformed', 0.3727164146702887), ('System_CM', 0.39118747703920975), ('C24:0', -0.40000678848073096), ('20:1/preformed', -0.49759738688674116), ('20:3n6/preformed', 0.5452427899102299), ('18:3,6,9,12/preformed', -0.5520408771774333), ('t4/preformed', -0.5521111735656695), ('Pregnant_U', -0.5821344545724386), ('ParityCategory_3plus', 0.5884025627436573), ('CLAc9t11', 0.6458787970669938), ('a15/OBC', -0.6920238820619072), ('t5/preformed', -0.779524018973016), ('MFD_No', -0.8102781435470087), ('AvgRum', 0.8290702044379678)