In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats.stats import pearsonr
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

# Project 2 Task 5

In [2]:
# Load the cleaned workbook into a DataFrame (path is relative to the working directory).
dataset_path = "18EA1-5 database cleaned HO only-version2.xlsx"
data = pd.read_excel(dataset_path)

### Data Analysis

In [3]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,Cow,Farm,System,Parity,ParityCategory,BCS,DIM,DIC,Pregnant,FatPEBV,...,"10,12/preformed",20:2n6/preformed,22/preformed,20:3n6/preformed,20:4n6/preformed,20:5n3/preformed,MFD,(C13 - C11)/OBC,(C14 - C12)/DN,Trans as % of preformed
0,43,1,CM,5,3plus,3.25,200.0,0,N,-0.2,...,0.0,0.000726,0.000663,0.004169,0.003576,0.000746,No,0.017162,0.305421,0.075902
1,51,1,CM,5,3plus,2.75,72.0,0,U,0.07,...,0.00162,0.0,0.000624,0.001575,0.002038,0.0,Yes,0.022726,0.281875,0.1978
2,405,1,CM,5,3plus,,212.0,0,N,,...,0.0,0.001104,0.001,0.0031,0.004451,0.000705,No,0.010826,0.240153,0.08482
3,408,1,CM,4,3plus,2.75,211.0,0,U,-0.07,...,0.0,0.001005,0.000442,0.002461,0.004562,0.000727,Yes,0.013623,0.296892,0.105783
4,423,1,CM,5,3plus,2.875,228.0,0,N,-0.215,...,0.0,0.000849,0.000583,0.003857,0.003656,0.00048,Yes,0.011611,0.297311,0.098821


#### Null values in Dataset

In [4]:
# Count the missing entries in every column, then order the columns from
# most incomplete to least incomplete.
null_counts = data.isnull().sum()
null_summary = null_counts.sort_values(ascending=False)

In [5]:
# Print one "column / null count" pair per entry, in the order of null_summary.
column_names = null_summary.index
null_counts_by_position = null_summary.values
for position in range(len(column_names)):
    print("Column: %s" % column_names[position])
    print("\tNull Count: %d\n" % null_counts_by_position[position])

Column: C20:2n6
	Null Count: 959

Column: BCS
	Null Count: 452

Column: C20:1c11
	Null Count: 273

Column: C18:1t4
	Null Count: 177

Column: i130/OBC
	Null Count: 171

Column: iC13:0
	Null Count: 171

Column: C18:1t5
	Null Count: 170

Column: FatPEBV
	Null Count: 118

Column: C18:3c9c12c15
	Null Count: 116

Column: C18:3c6c9c12
	Null Count: 114

Column: aC17:0
	Null Count: 108

Column: a17/OBC
	Null Count: 108

Column: C22:0
	Null Count: 56

Column: C20:4n6
	Null Count: 32

Column: C20:3n6
	Null Count: 25

Column: C18:1t15
	Null Count: 22

Column: 17/OBC
	Null Count: 16

Column: C17:0
	Null Count: 16

Column: iC15:0
	Null Count: 14

Column: i15/OBC
	Null Count: 14

Column: Fat%
	Null Count: 13

Column: C18:1t12
	Null Count: 11

Column: C20:0
	Null Count: 9

Column: iC17:0
	Null Count: 8

Column: C18:2c9c12
	Null Count: 7

Column: iC14:0
	Null Count: 2

Column: i14/OBC
	Null Count: 2

Column: C11/OBC
	Null Count: 1

Column: C12:0
	Null Count: 1

Column: a130/OBC
	Null Count: 1

Column: 

C20:2n6 and BCS have too many null values (959 and 452, respectively). These features seem like they would be important in determining our target values, but imputing this many missing entries would not be wise, as it could have negative effects on our model. We will instead remove these features (together with C20:1c11, the next most incomplete column) and then drop the remaining rows that contain nulls, in order to preserve more data.

### Data Preprocessing

In [7]:
# Keep only the float-typed columns; integer and non-numeric columns are left out.
float_dtypes = ["float"]
filtered_data = data.select_dtypes(include=float_dtypes)

In [20]:
# Labels of the columns with the most missing values; null_summary is sorted
# in descending order, so these are its leading entries.
most_incomplete_count = 12
null_columns = null_summary.index[:most_incomplete_count]

Index(['C20:2n6', 'BCS', 'C20:1c11', 'C18:1t4', 'i130/OBC', 'iC13:0',
       'C18:1t5', 'FatPEBV', 'C18:3c9c12c15', 'C18:3c6c9c12', 'aC17:0',
       'a17/OBC'],
      dtype='object')

In [8]:
# Columns the models will try to predict.
targets = ["AvgMilk", "Fat%", "FatY", "Pro%", "ProY"]

# Float-typed columns, and every remaining column of the raw table
# (the latter is whatever select_dtypes(include="float") left out).
numerical_features = filtered_data.columns
categorical_features = data.columns.difference(numerical_features)

In [9]:
# Columns that must not be used as predictors: the targets themselves plus the
# three features with the most missing values.
excluded_columns = targets + ["BCS", "C20:2n6", "C20:1c11"]

# Remove those columns, then discard every row that still has a missing value.
X = filtered_data.drop(columns=excluded_columns).dropna()

# Take the target values for the surviving rows and discard rows whose targets
# are incomplete.
y = data.loc[X.index, targets].dropna()

# Re-align the features with the rows that still have complete targets.
X = X.loc[y.index]

# Standardize each feature column: subtract its mean, divide by its standard deviation.
X = X.apply(lambda column: (column - column.mean()) / column.std())

### Principal Component Analysis

In [10]:
# Project the standardized features onto their first 10 principal components.
# PCA is unsupervised, so no target is passed to fit(). The fixed random_state
# keeps the result repeatable when scikit-learn selects a randomized SVD solver.
# NOTE(review): the projection is fitted on all rows before any train/test
# split is made, so held-out rows influence the components - confirm this is
# acceptable for the comparison being reported.
pca = PCA(n_components=10, random_state=42)
pca.fit(X)
new_X = pca.transform(X)

# Share of total variance captured by each component, then the matching singular values.
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

[0.22284962 0.12881098 0.11191416 0.08802682 0.06248225 0.05303021
 0.03354962 0.02876534 0.02809733 0.02500549]
[168.37998821 128.0150308  119.32379002 105.82591531  89.15852287
  82.13838911  65.33235963  60.494975    59.78842073  56.4030007 ]


In [11]:
# Empty containers, keyed by target name once filled, for the test-set score
# of each model / feature-space combination.
lasso_pca_target_scores, lasso_target_scores = {}, {}
rf_pca_target_scores, rf_target_scores = {}, {}

### Lasso with PCA

In [12]:
# Lasso on the PCA projection: one independent fit per target.
# A fixed random_state makes the 70/30 split, and therefore the reported score,
# reproducible from run to run.
# NOTE(review): alpha=0 switches the L1 penalty off, so this is ordinary least
# squares; scikit-learn advises against alpha=0 with Lasso for numerical
# reasons - consider LinearRegression or a positive alpha.
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        new_X, y[target], test_size=.3, random_state=42
    )

    lasso = Lasso(alpha=0)
    lasso.fit(X_train, y_train)
    # score() returns the coefficient of determination (R^2) on the held-out rows.
    lasso_pca_target_scores[target] = lasso.score(X_test, y_test)

### Lasso without PCA

In [13]:
# Lasso on the full standardized feature matrix: one independent fit per target.
# A fixed random_state makes the 70/30 split, and therefore the reported score,
# reproducible from run to run.
# NOTE(review): alpha=0 switches the L1 penalty off, so this is ordinary least
# squares; scikit-learn advises against alpha=0 with Lasso for numerical
# reasons - consider LinearRegression or a positive alpha.
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[target], test_size=.3, random_state=42
    )

    lasso = Lasso(alpha=0)
    lasso.fit(X_train, y_train)
    # score() returns the coefficient of determination (R^2) on the held-out rows.
    lasso_target_scores[target] = lasso.score(X_test, y_test)

### Random Forest with PCA

In [14]:
# Random forest on the PCA projection: one independent fit per target.
# Both the 70/30 split and the forest are seeded so the reported score is
# reproducible from run to run.
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        new_X, y[target], test_size=.3, random_state=42
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    # score() returns the coefficient of determination (R^2) on the held-out rows.
    rf_pca_target_scores[target] = rf.score(X_test, y_test)

### Random Forest without PCA

In [15]:
# Random forest on the full standardized feature matrix: one independent fit per target.
# Both the 70/30 split and the forest are seeded so the reported score is
# reproducible from run to run.
for target in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y[target], test_size=.3, random_state=42
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    # score() returns the coefficient of determination (R^2) on the held-out rows.
    rf_target_scores[target] = rf.score(X_test, y_test)

In [16]:
# Print every score dictionary under its heading; a blank line separates
# consecutive sections, and the last section ends with a single newline.
score_sections = [
    ("Lasso with PCA", lasso_pca_target_scores),
    ("Lasso without PCA", lasso_target_scores),
    ("Random Forest with PCA", rf_pca_target_scores),
    ("Random Forest without PCA", rf_target_scores),
]
final_position = len(score_sections) - 1
for position, (heading, scores) in enumerate(score_sections):
    print(heading)
    if position == final_position:
        print(scores)
    else:
        print(scores, end="\n\n")

Lasso with PCA
{'AvgMilk': 0.7467163422605388, 'Fat%': 0.31191828967954605, 'FatY': 0.9120399739965614, 'Pro%': 0.36942151016080205, 'ProY': 0.7575653383681301}

Lasso without PCA
{'AvgMilk': 0.9997368671997171, 'Fat%': 0.9322030097490346, 'FatY': 0.9999336826774019, 'Pro%': 0.6329898975765103, 'ProY': 0.9209132917561478}

Random Forest with PCA
{'AvgMilk': 0.702920733696305, 'Fat%': 0.38562358997226864, 'FatY': 0.8388345524421039, 'Pro%': 0.34752132613032427, 'ProY': 0.6518497662588697}

Random Forest without PCA
{'AvgMilk': 0.9176115785215354, 'Fat%': 0.5215039173169741, 'FatY': 0.9766648819339158, 'Pro%': 0.4341773086036771, 'ProY': 0.8687818234363296}


### Discovering Faults in Features

In [17]:
# Number of rows left in the feature matrix after preprocessing.
X.shape[0]

1117

In [282]:
# Feature columns singled out for closer inspection in this section.
# NOTE(review): presumably these are derived from the milk-yield targets and so
# leak target information into the predictors - confirm against the data dictionary.
questionable_features = ["ECM", "0.4 FCM"]