## Data Dictionary

| Column Name      | Data Type   | Description                                                |
|------------------|-------------|------------------------------------------------------------|
| id               | integer     | Identification number of the individual crab record        |
| Sex              | categorical | M for Male; F for Female; I for Infant                     |
| Length           | float       | Measured from the front (eye) carapace to the tail         |
| Diameter         | float       | Measured from one side of the carapace to the other        |
| Height           | float       | Measured from the base of the body to the top              |
| Weight           | float       | The overall weight of the crab                             |
| Shucked Weight   | float       | The weight of 'meat'                                       |
| Viscera Weight   | float       | The weight of the internal organs                          |
| Shell Weight     | float       | The weight of the shell                                    |
| Age              | integer     | Target variable: age of the crab (absent from test set)    |


In [8]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# apply the 'ggplot' style sheet to all matplotlib figures
plt.style.use('ggplot')

# pandas display options: None lifts the column/row count limits, while the
# large widths keep long cell values and wide frames from being cut short
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 10000),
    ('display.max_rows', None),
    ('display.width', 10000),
):
    pd.set_option(option_name, option_value)

In [12]:
# load the training set from disk
train_raw = pd.read_csv('../data/train.csv')

# first look at the training frame: leading rows, then the dtype of each column
print(train_raw.head(), train_raw.dtypes, sep='\n')

   id Sex  Length  Diameter  Height     Weight  Shucked Weight  Viscera Weight  Shell Weight  Age
0   0   I  1.5250    1.1750  0.3750  28.973189       12.728926        6.647958      8.348928    9
1   1   I  1.1000    0.8250  0.2750  10.418441        4.521745        2.324659      3.401940    8
2   2   M  1.3875    1.1125  0.3750  24.777463       11.339800        5.556502      6.662133    9
3   3   F  1.7000    1.4125  0.5000  50.660556       20.354941       10.991839     14.996885   11
4   4   I  1.2500    1.0125  0.3375  23.289114       11.977664        4.507570      5.953395    8
id                  int64
Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
Age                 int64
dtype: object


In [13]:
# load the test set from disk
test_raw = pd.read_csv('../data/test.csv')

# first look at the test frame: leading rows, then the dtype of each column
print(test_raw.head(), test_raw.dtypes, sep='\n')

      id Sex  Length  Diameter  Height     Weight  Shucked Weight  Viscera Weight  Shell Weight
0  74051   I  1.0500    0.7625  0.2750   8.618248        3.657085        1.729319      2.721552
1  74052   I  1.1625    0.8875  0.2750  15.507176        7.030676        3.246018      3.968930
2  74053   F  1.2875    0.9875  0.3250  14.571643        5.556502        3.883882      4.819415
3  74054   F  1.5500    0.9875  0.3875  28.377849       13.380964        6.548735      7.030676
4  74055   I  1.1125    0.8500  0.2625  11.765042        5.528153        2.466407      3.331066
id                  int64
Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
dtype: object


In [15]:
# Maintain consistency between columns


def clean_headers(some_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``some_dataframe`` with normalised column headers.

    Each header is transformed, in this order:

    1. lower-cased,
    2. stripped of leading and trailing whitespace,
    3. every remaining space replaced by an underscore.

    For example ``' Shucked Weight '`` becomes ``'shucked_weight'``.

    The headers are rewritten on a copy, so the frame passed in keeps its
    original column names and the result never aliases the input.

    Parameters
    ----------
    some_dataframe : pd.DataFrame
        Frame whose column labels are strings (the ``.str`` accessor is
        used on the column index).

    Returns
    -------
    pd.DataFrame
        A new frame with the same data and the cleaned headers.
    """
    # work on a copy: assigning to ``.columns`` would otherwise rename the
    # caller's frame in place
    cleaned = some_dataframe.copy()

    cleaned.columns = (
        cleaned.columns
        .str.lower()                          # lower case the headers
        .str.strip()                          # drop leading/trailing whitespace
        .str.replace(' ', '_', regex=False)   # literal space -> underscore
    )

    return cleaned




In [16]:
# normalise the column headers of the train and test frames
train_df, test_df = (clean_headers(frame) for frame in (train_raw, test_raw))

# confirm the headers were cleaned: first two rows of each frame,
# separated by a blank line
print(train_df.head(2), test_df.head(2), sep='\n\n')

   id sex  length  diameter  height     weight  shucked_weight  viscera_weight  shell_weight  age
0   0   I   1.525     1.175   0.375  28.973189       12.728926        6.647958      8.348928    9
1   1   I   1.100     0.825   0.275  10.418441        4.521745        2.324659      3.401940    8

      id sex  length  diameter  height     weight  shucked_weight  viscera_weight  shell_weight
0  74051   I  1.0500    0.7625   0.275   8.618248        3.657085        1.729319      2.721552
1  74052   I  1.1625    0.8875   0.275  15.507176        7.030676        3.246018      3.968930
