In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.linalg as la
import math
import matplotlib
import matplotlib.pyplot as plt

## Project Focus:

Fit a regression line to the provided data to estimate and predict abalone age based on multiple physical attributes.
<br>
Provide reasoning for the multivariate techniques used, as well as the assumptions that make them valid to use.

## Credentials

Data comes from an original study:

	Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and
	Wes B Ford (1994) "The Population Biology of Abalone (_Haliotis_
	species) in Tasmania. I. Blacklip Abalone (_H. rubra_) from the North
	Coast and Islands of Bass Strait", Sea Fisheries Division, Technical
	Report No. 48 (ISSN 1034-3288)
    
Original owners of data:
    
	Marine Resources Division
	Marine Research Laboratories - Taroona
	Department of Primary Industry and Fisheries, Tasmania
	GPO Box 619F, Hobart, Tasmania 7001, Australia
	(contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au)

## Cleaning Data

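Attribute information (units are those of the original measurements; per the dataset documentation, the continuous values stored in `abalone.data` have been scaled by dividing by 200):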

| Variable  | Data Type | Units | Description |
| :--- | :--- | :--- | :--- |
| Sex | Nominal | M, F, I | M = male, F = female, I = infant |
| Length | Continuous |	mm | longest shell measurement |
| Diameter | Continuous | mm | perpendicular to length |
| Height | Continuous | mm | with meat in shell |
| Whole weight | Continuous | grams | whole abalone |
| Shucked weight | Continuous | grams |weight of meat |
| Viscera weight | Continuous | grams | gut weight (after bleeding) |
| Shell weight | Continuous | grams | after being dried |
| Rings | Integer | | +1.5 gives the age in years |

   



In [31]:
# Column headers for the header-less "abalone.data" file, in file order.
ABALONE_COLUMNS = ['Sex', 'Length', 'Diameter', 'Height', 'Whole Weight',
                   'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Ring Count']

# Non-destructive measurements are length, diameter, height, and whole weight.
# The shucked, viscera, and shell weights are therefore removed from every subset.
DESTRUCTIVE_COLUMNS = ['Shucked Weight', 'Viscera Weight', 'Shell Weight']


def subset_by_sex(abalone, sex_code):
    """Return the rows of `abalone` whose 'Sex' equals `sex_code`, trimmed for analysis.

    The 'Sex' column (constant within a subset, hence redundant) and the
    columns listed in DESTRUCTIVE_COLUMNS are dropped, and the index is
    renumbered from 0 so the old, segmented row labels are discarded.
    'Ring Count' is kept for future use.

    Parameters
    ----------
    abalone : pandas.DataFrame
        Frame containing a 'Sex' column and the DESTRUCTIVE_COLUMNS.
    sex_code : str
        Value of 'Sex' to select ('M', 'F', or 'I').

    Returns
    -------
    pandas.DataFrame
        A new dataframe; `abalone` itself is not modified.
    """
    return (
        abalone.loc[abalone['Sex'] == sex_code]
        .drop(columns=['Sex'] + DESTRUCTIVE_COLUMNS)
        .reset_index(drop=True)
    )


# Import the .data file into a dataframe with the column headers listed above.
Aba_data = pd.read_csv("abalone.data", names=ABALONE_COLUMNS)

# Separate the male, female, and infant abalone data to minimize confounding factors.
Aba_data_male = subset_by_sex(Aba_data, 'M')
Aba_data_female = subset_by_sex(Aba_data, 'F')
Aba_data_inf = subset_by_sex(Aba_data, 'I')

# Sanity check: the three subsets must account for every row, otherwise some
# observations carry an unexpected 'Sex' value and would be silently lost.
assert len(Aba_data_male) + len(Aba_data_female) + len(Aba_data_inf) == len(Aba_data), \
    "Unexpected value(s) in the 'Sex' column: some rows are not 'M', 'F', or 'I'."

display(Aba_data_male, Aba_data_female, Aba_data_inf)


Unnamed: 0,Length,Diameter,Height,Whole Weight,Ring Count
0,0.455,0.365,0.095,0.5140,15
1,0.350,0.265,0.090,0.2255,7
2,0.440,0.365,0.125,0.5160,10
3,0.475,0.370,0.125,0.5095,9
4,0.430,0.350,0.110,0.4060,10
...,...,...,...,...,...
1523,0.550,0.430,0.130,0.8395,10
1524,0.560,0.430,0.155,0.8675,8
1525,0.590,0.440,0.135,0.9660,10
1526,0.600,0.475,0.205,1.1760,9


Unnamed: 0,Length,Diameter,Height,Whole Weight,Ring Count
0,0.530,0.420,0.135,0.6770,9
1,0.530,0.415,0.150,0.7775,20
2,0.545,0.425,0.125,0.7680,16
3,0.550,0.440,0.150,0.8945,19
4,0.525,0.380,0.140,0.6065,14
...,...,...,...,...,...
1302,0.585,0.475,0.165,1.0530,11
1303,0.585,0.455,0.170,0.9945,11
1304,0.515,0.400,0.125,0.6150,8
1305,0.565,0.450,0.165,0.8870,11


Unnamed: 0,Length,Diameter,Height,Whole Weight,Ring Count
0,0.330,0.255,0.080,0.2050,7
1,0.425,0.300,0.095,0.3515,8
2,0.355,0.280,0.085,0.2905,7
3,0.380,0.275,0.100,0.2255,10
4,0.240,0.175,0.045,0.0700,5
...,...,...,...,...,...
1337,0.480,0.355,0.110,0.4495,8
1338,0.390,0.310,0.085,0.3440,7
1339,0.390,0.290,0.100,0.2845,7
1340,0.405,0.300,0.085,0.3035,7


## Descriptive Statistics for Length, Diameter, Height, Whole Weight, and Ring Count

Calculate the sample mean, sample standard deviation, sample variance, interquartile range, and correlation coefficient for the predictor variables and the response variable in each dataframe.

In [13]:
# The number of observations in each group is needed for the descriptive statistics and later calculations.
# len() of a dataframe is its row count, i.e. the number of observations in that group.
n_male, n_female, n_inf = map(len, (Aba_data_male, Aba_data_female, Aba_data_inf))

print(n_male, n_female, n_inf)

1528 1307 1342


#### Sample Mean for Length, Diameter, Height, Whole Weight, and Ring Count.

In [25]:
# Column-wise sample mean of each group.
# DataFrame.mean over the index axis returns one value per column as a pandas Series.
Aba_data_male_mean, Aba_data_female_mean, Aba_data_inf_mean = (
    group.mean(axis="index")
    for group in (Aba_data_male, Aba_data_female, Aba_data_inf)
)

display(Aba_data_male_mean, Aba_data_female_mean, Aba_data_inf_mean)


Length           0.561391
Diameter         0.439287
Height           0.151381
Whole Weight     0.991459
Ring Count      10.705497
dtype: float64

Length           0.579093
Diameter         0.454732
Height           0.158011
Whole Weight     1.046532
Ring Count      11.129304
dtype: float64

Length          0.427746
Diameter        0.326494
Height          0.107996
Whole Weight    0.431363
Ring Count      7.890462
dtype: float64

#### Sample standard deviation for Length, Diameter, Height, Whole Weight, and Ring Count.

In [30]:
# Column-wise sample standard deviation of each group, returned as a pandas Series per group.
Aba_data_male_std, Aba_data_female_std, Aba_data_inf_std = (
    group.std(axis="index")
    for group in (Aba_data_male, Aba_data_female, Aba_data_inf)
)

display(Aba_data_male_std, Aba_data_female_std, Aba_data_inf_std)

Length          0.102697
Diameter        0.084398
Height          0.034804
Whole Weight    0.470581
Ring Count      3.026349
dtype: float64

Length          0.086160
Diameter        0.070954
Height          0.039984
Whole Weight    0.430316
Ring Count      3.104256
dtype: float64

Length          0.108858
Diameter        0.088109
Height          0.031995
Whole Weight    0.286275
Ring Count      2.511554
dtype: float64

#### Sample variance for Length, Diameter, Height, Whole Weight, and Ring Count.

In [29]:
# Column-wise sample variance of each group, computed directly from the data.
# DataFrame.var uses ddof=1 by default (the n - 1 denominator), matching the
# sample standard deviation. Computing it here avoids depending on the
# standard-deviation cell having been run first, and avoids the
# square-root/square round trip of squaring a previously computed std.
Aba_data_male_var = Aba_data_male.var(axis=0)
Aba_data_female_var = Aba_data_female.var(axis=0)
Aba_data_inf_var = Aba_data_inf.var(axis=0)

display(Aba_data_male_var, Aba_data_female_var, Aba_data_inf_var)

Length          0.010547
Diameter        0.007123
Height          0.001211
Whole Weight    0.221447
Ring Count      9.158791
dtype: float64

Length          0.007423
Diameter        0.005034
Height          0.001599
Whole Weight    0.185172
Ring Count      9.636407
dtype: float64

Length          0.011850
Diameter        0.007763
Height          0.001024
Whole Weight    0.081953
Ring Count      6.307903
dtype: float64

#### First Quartile, Median (Second Quartile), Third Quartile, and Interquartile Range with Outliers.