In [33]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in dataset 1
data = pd.read_excel("dataset_1.xls")

# Data Cleaning

From the initial viewing of the data, we noticed that many instances had missing data for some dates (usually tdate10 through tdate12). For now, we have set those values to NaN.

We also noticed whitespace occurred in the columns. We erased them and lowercased the names for simplicity.

In [3]:
# Example of an instance without data for each date
data.iloc[0,:]

State                          11
 Herd                           1
# PTAs                        581
 PTAM                         272
PTAF                           17
PTAP                           11
RHA # cows                    590
 % W PTAs                      98
 RHAM                       29810
 RHAF                        1093
 RHAP                         921
Tdate1        2018-08-15 00:00:00
 Milk1                         86
 Fat1                         3.4
 Pro1                         3.1
Tdate2        2018-07-21 00:00:00
 Milk2                         87
 Fat2                         3.5
 Pro2                           3
Tdate3        2018-06-13 00:00:00
 Milk3                         86
 Fat3                         3.4
 Pro3                           3
Tdate4        2018-05-16 00:00:00
 Milk4                         86
 Fat4                         3.4
 Pro4                         3.1
Tdate5        2018-04-11 00:00:00
 Milk5                         80
 Fat5         

In [4]:
# Remove whitespace from column names
data.columns = list(map(lambda x: x.strip().lower(), data.columns))

# Find values of '  .' and set to NaN
for col in data.columns:
    try:
        data[col][data[col] == '  .'] = np.nan
        data[col][data[col] == ' . '] = np.nan
        data[col][data[col].apply(lambda x: x == '    .')] = \
            (data[col][data[col].apply(lambda x: x != '    .')]).mean()
        if "date" not in col:
            data[col] = data[col].astype(float)
    except:
        continue
        
data["rhap"][data.rhap.apply(lambda x: x == '   .')] = data.rhap[data.rhap.apply(lambda x: x != '   .')].mean()
data["rhaf"][data.rhaf.apply(lambda x: x == '   .')] = data.rhaf[data.rhaf.apply(lambda x: x != '   .')].mean()

# Data Analysis

In [5]:
# Dimensions of data
data.shape

(7919, 59)

In [6]:
# Initial summary statistics
data.describe()

Unnamed: 0,state,herd,# ptas,ptam,ptaf,ptap,rha # cows,% w ptas,rham,milk1,...,pro9,milk10,fat10,pro10,milk11,fat11,pro11,milk12,fat12,pro12
count,7919.0,7919.0,7919.0,7919.0,7919.0,7919.0,7919.0,7919.0,7919.0,7916.0,...,6383.0,5882.0,5866.0,5866.0,5101.0,5091.0,5091.0,3806.0,3798.0,3798.0
mean,31.738603,3960.0,142.206971,150.691375,9.027024,5.764996,216.199899,61.779265,22959.591208,61.218924,...,3.150807,62.027882,3.87523,3.140811,61.72574,3.805637,3.0967,62.246978,3.779568,3.064007
std,11.987906,2286.162724,355.044773,356.828884,12.999496,10.967446,441.442926,28.500609,4041.978663,12.303426,...,0.134838,11.673558,0.320722,0.137292,11.787126,0.314055,0.127777,11.525574,0.301237,0.122081
min,11.0,1.0,1.0,-2940.0,-108.0,-99.0,2.0,0.0,462.0,2.0,...,2.5,15.0,2.2,2.5,17.0,2.4,2.5,15.0,2.4,2.5
25%,23.0,1980.5,31.0,-2.5,3.0,1.0,60.0,44.0,20590.5,54.0,...,3.1,55.0,3.7,3.1,54.0,3.6,3.0,55.0,3.6,3.0
50%,31.0,3960.0,57.0,213.0,11.0,7.0,95.0,71.0,23321.0,62.0,...,3.1,63.0,3.9,3.1,62.0,3.8,3.1,63.0,3.8,3.1
75%,41.0,5939.5,117.0,370.0,17.0,12.0,188.0,84.0,25659.5,70.0,...,3.2,70.0,4.1,3.2,70.0,4.0,3.2,70.0,3.9,3.1
max,74.0,7919.0,9789.0,1234.0,63.0,44.0,9008.0,250.0,36275.0,129.0,...,3.7,100.0,6.6,3.8,102.0,6.2,3.7,100.0,5.9,3.7


In [7]:
# Amount of unique states the herds are from
len(data.state.unique())

39

In [8]:
# Amount of unique herds, since each row is a unique herd, should be the same amount
len(data.herd.unique())

7919

# Data Visualizations

# Model Creation

In [56]:
milk_columns = []
for col in data.columns:
    if "milk" in col:
        milk_columns.append(col)
feature_cols = ['# ptas', 'ptam', 'ptaf', 'ptap', 'rha # cows',
                '% w ptas', 'rham', 'rhaf', 'rhap']

X = data.loc[:, feature_cols]
y = data.loc[:, milk_columns].mean(axis=1, skipna=True)

X = X[~y.isna()]
y = y[~y.isna()]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

lasso = Lasso(alpha=5)

cross_val_score(lasso, X_train, y_train, cv=5)

lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)

0.9848636173130209