In [22]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats.stats import pearsonr
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

# Project 2 Task 5

In [79]:
# Read the project spreadsheet (path is relative to the working directory) into `data`.
data = pd.read_excel(io="18EA1-5 database cleaned HO only-version2.xlsx")

### Data Analysis

In [2]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,Cow,Farm,System,Parity,ParityCategory,BCS,DIM,DIC,Pregnant,FatPEBV,...,"10,12/preformed",20:2n6/preformed,22/preformed,20:3n6/preformed,20:4n6/preformed,20:5n3/preformed,MFD,(C13 - C11)/OBC,(C14 - C12)/DN,Trans as % of preformed
0,43,1,CM,5,3plus,3.25,200.0,0,N,-0.2,...,0.0,0.000726,0.000663,0.004169,0.003576,0.000746,No,0.017162,0.305421,0.075902
1,51,1,CM,5,3plus,2.75,72.0,0,U,0.07,...,0.00162,0.0,0.000624,0.001575,0.002038,0.0,Yes,0.022726,0.281875,0.1978
2,405,1,CM,5,3plus,,212.0,0,N,,...,0.0,0.001104,0.001,0.0031,0.004451,0.000705,No,0.010826,0.240153,0.08482
3,408,1,CM,4,3plus,2.75,211.0,0,U,-0.07,...,0.0,0.001005,0.000442,0.002461,0.004562,0.000727,Yes,0.013623,0.296892,0.105783
4,423,1,CM,5,3plus,2.875,228.0,0,N,-0.215,...,0.0,0.000849,0.000583,0.003857,0.003656,0.00048,Yes,0.011611,0.297311,0.098821


#### Null values in Dataset

In [217]:
# Count missing values per column, ordered from fewest to most nulls.
null_summary = (
    data
    .isna()         # same boolean mask as isnull()
    .sum()          # column-wise count of True entries
    .sort_values()  # ascending by count
)

In [226]:
# Report every column together with its null count, in null_summary's order.
for column_name, null_count in null_summary.items():
    print("Column: %s" % column_name)
    print("\tNull Count: %d\n" % null_count)

Column: Cow
	Null Count: 0

Column: 16/16C
	Null Count: 0

Column: i16/OBC
	Null Count: 0

Column: 15/OBC
	Null Count: 0

Column: 14:1c9/DN
	Null Count: 0

Column: a15/OBC
	Null Count: 0

Column: 14/DN
	Null Count: 0

Column: 13/OBC
	Null Count: 0

Column: C10:1c9/DN
	Null Count: 0

Column: C10/DN
	Null Count: 0

Column: C8/DN
	Null Count: 0

Column: 16c9/16C
	Null Count: 0

Column: C6/DN
	Null Count: 0

Column: C22:5n3
	Null Count: 0

Column: C22:4n6
	Null Count: 0

Column: C24:1n9
	Null Count: 0

Column: C24:0
	Null Count: 0

Column: C20:5n3
	Null Count: 0

Column: CLAt10c12
	Null Count: 0

Column: C18:1c12
	Null Count: 0

Column: C18:1c11
	Null Count: 0

Column: C18:1c9
	Null Count: 0

Column: C18:1t10
	Null Count: 0

Column: C4/DN
	Null Count: 0

Column: 17:1c10/OBC
	Null Count: 0

Column: 18:0/preformed
	Null Count: 0

Column: t4/preformed
	Null Count: 0

Column: MFD
	Null Count: 0

Column: 20:5n3/preformed
	Null Count: 0

Column: 20:4n6/preformed
	Null Count: 0

Column: 20:3n6/pr

`C20:2n6` and `BCS` have too many null values. Both features seem likely to be important for predicting our target values, but imputing that many missing entries could distort the model. We therefore remove these two features, rather than dropping every row in which they are missing, in order to preserve more data.

### Data Preprocessing

In [238]:
# Keep only the float-typed columns as candidate numerical features;
# integer and object columns are left out by this selection.
filtered_data = data.select_dtypes(include=["float"])

In [240]:
# Column labels of the float-only frame.
numerical_features = filtered_data.columns

# Every remaining column of `data`, i.e. anything that is not float-typed
# (so this may include integer columns as well as true categoricals).
categorical_features = data.columns.difference(numerical_features)

# Response variables; each one is modelled separately further down.
targets = [
    "AvgMilk",
    "Fat%",
    "FatY",
    "Pro%",
    "ProY",
]

In [241]:
# Feature matrix: remove the target columns and the two sparsely populated
# features, then keep only rows with no missing feature values.
X = filtered_data.drop(columns=targets + ["BCS", "C20:2n6"]).dropna()

# Targets for those same rows, discarding rows where any target is missing.
y = data.loc[X.index, targets].dropna()

# Re-align the features so X and y cover exactly the same rows.
X = X.loc[y.index]

# Standardize each feature column to zero mean and unit (sample) standard deviation.
# NOTE(review): the statistics are computed on all rows, before any train/test
# split, so test rows influence the scaling -- confirm this is acceptable.
X = X.apply(lambda col: col.sub(col.mean()).div(col.std()))

### Principal Component Analysis

In [185]:
# Project the feature matrix onto its first 10 principal components.
#
# PCA is unsupervised: scikit-learn ignores any `y` handed to fit(), so only X
# is passed. random_state pins the result whenever a randomized SVD solver is
# selected, making new_X reproducible across runs.
#
# NOTE(review): the PCA is fitted on every row before any train/test split, so
# held-out rows influence the projection -- confirm this is acceptable.
# NOTE(review): the saved output under this cell lists far more than 10 ratios,
# so it appears stale relative to n_components=10; re-run to refresh it.
pca = PCA(n_components=10, random_state=42)
pca.fit(X)
new_X = pca.transform(X)

# Share of total variance captured by each retained component.
print(pca.explained_variance_ratio_)
# Singular values corresponding to the retained components.
print(pca.singular_values_)

[9.12970590e-01 5.41398360e-02 2.81812590e-02 4.23315307e-03
 3.17221866e-04 1.04856566e-04 3.37852606e-05 9.17645160e-06
 3.06981736e-06 2.28280724e-06 1.79754699e-06 8.83225043e-07
 6.46079343e-07 3.56652563e-07 3.18712608e-07 2.02603777e-07
 1.75246241e-07 7.27161304e-08 6.69674179e-08 5.23718398e-08
 4.16886064e-08 3.44461978e-08 2.85432564e-08 1.46252783e-08
 1.31567192e-08 1.19436602e-08 1.01156479e-08 7.05144199e-09
 5.00635205e-09 3.92325211e-09 3.44348877e-09 3.06167917e-09
 2.43487501e-09 2.36943674e-09 2.27135323e-09 1.76785063e-09
 1.53608531e-09 1.37761922e-09 1.15268439e-09 1.00497087e-09
 8.22753987e-10 7.18816744e-10 6.54363623e-10 5.94824968e-10
 4.95848585e-10 4.79463042e-10 3.54333889e-10 3.28957448e-10
 2.92925495e-10 2.17874801e-10 1.86234201e-10 1.72035159e-10
 1.18519046e-10 1.18118741e-10 9.70962495e-11 8.93400272e-11
 7.91509183e-11 5.82358635e-11 5.07681735e-11 4.80910223e-11
 3.79116828e-11 3.72377152e-11 3.02416585e-11 2.53612166e-11
 2.42160477e-11 2.167645

In [249]:
# One result table per model / feature-set combination.
# Each maps a target column name to the test score filled in by later cells.
lasso_pca_target_scores = dict()
lasso_target_scores = dict()
rf_pca_target_scores = dict()
rf_target_scores = dict()

### Lasso with PCA

In [250]:
# Lasso on the PCA-reduced features: fit one model per target and record its
# held-out score (R^2 for scikit-learn regressors).
for target in targets:
    # A fixed random_state makes the 70/30 split reproducible and gives every
    # target in this loop the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        new_X, y[target], test_size=0.3, random_state=42
    )

    # NOTE(review): alpha=0 switches off the L1 penalty (plain least squares);
    # scikit-learn advises against alpha=0 with Lasso -- confirm it is intended.
    lasso = Lasso(alpha=0)
    lasso.fit(X_train, y_train)
    lasso_pca_target_scores[target] = lasso.score(X_test, y_test)

### Lasso without PCA

In [251]:
# Lasso on the full standardized feature set: fit one model per target and
# record its held-out score (R^2 for scikit-learn regressors).
for target in targets:
    # A fixed random_state makes the 70/30 split reproducible and gives every
    # target in this loop the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[target], test_size=0.3, random_state=42
    )

    # NOTE(review): alpha=0 switches off the L1 penalty (plain least squares);
    # scikit-learn advises against alpha=0 with Lasso -- confirm it is intended.
    lasso = Lasso(alpha=0)
    lasso.fit(X_train, y_train)
    lasso_target_scores[target] = lasso.score(X_test, y_test)

### Random Forest with PCA

In [252]:
# Random forest on the PCA-reduced features: fit one model per target and
# record its held-out score (R^2 for scikit-learn regressors).
for target in targets:
    # A fixed random_state makes the 70/30 split reproducible and gives every
    # target in this loop the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        new_X, y[target], test_size=0.3, random_state=42
    )

    # Seed the forest as well: bootstrapping and feature sampling are random,
    # so without a seed the score changes from run to run.
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_pca_target_scores[target] = rf.score(X_test, y_test)

### Random Forest without PCA

In [253]:
# Random forest on the full standardized feature set: fit one model per target
# and record its held-out score (R^2 for scikit-learn regressors).
for target in targets:
    # A fixed random_state makes the 70/30 split reproducible and gives every
    # target in this loop the same train/test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[target], test_size=0.3, random_state=42
    )

    # Seed the forest as well: bootstrapping and feature sampling are random,
    # so without a seed the score changes from run to run.
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_target_scores[target] = rf.score(X_test, y_test)

In [258]:
# Show each result table under its heading, with one blank line between tables.
score_tables = [
    ("Lasso with PCA", lasso_pca_target_scores),
    ("Lasso without PCA", lasso_target_scores),
    ("Random Forest with PCA", rf_pca_target_scores),
    ("Random Forest without PCA", rf_target_scores),
]
print("\n\n".join(f"{heading}\n{scores}" for heading, scores in score_tables))

Lasso with PCA
{'AvgMilk': 0.9992248455187912, 'Fat%': 0.9328369828714774, 'FatY': 0.999805023749142, 'Pro%': 0.5549007947789224, 'ProY': 0.9213173826209089}

Lasso without PCA
{'AvgMilk': 0.9997113018488337, 'Fat%': 0.9332203418358187, 'FatY': 0.9999535620908748, 'Pro%': 0.5675893437976787, 'ProY': 0.9188849162504147}

Random Forest with PCA
{'AvgMilk': 0.8592430853001051, 'Fat%': 0.49250274102291863, 'FatY': 0.9473568418796283, 'Pro%': 0.510833771295502, 'ProY': 0.8482173035486043}

Random Forest without PCA
{'AvgMilk': 0.8932390550083047, 'Fat%': 0.5894251881339044, 'FatY': 0.9807793988606976, 'Pro%': 0.5274398450105359, 'ProY': 0.8581172958337827}
