In [1]:
from pathlib import Path

import pandas as pd

In [2]:
pwd

'/Users/tlevier/Downloads'

In [3]:
cd ..

/Users/tlevier


In [4]:
# "cd .." moves up one directory level (here from /Users/tlevier/Downloads to /Users/tlevier); from there we can change into "Desktop" instead

In [5]:
cd /Users/tlevier/Desktop

/Users/tlevier/Desktop


In [6]:
# read in surveys data (the path is relative to the current working directory set above)
surveys_df = pd.read_csv('data/surveys.csv')

In [7]:
# check what kind of object read_csv gave us: a pandas DataFrame
type(surveys_df)

pandas.core.frame.DataFrame

In [8]:
# a single column selected with [] is a pandas Series
type(surveys_df['sex'])

pandas.core.series.Series

In [9]:
# preview the first five rows of the data
surveys_df.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [10]:
surveys_df['sex'].dtype
# dtype 'O' stands for "object": pandas' catch-all dtype, used here because the column holds text (Python strings)

dtype('O')

In [11]:
# record_id holds whole numbers, so its dtype is a 64-bit integer
surveys_df['record_id'].dtype

dtype('int64')

In [12]:
# get the data type of all columns in a dataframe
surveys_df.dtypes
# .dtypes (plural) reports one dtype per column of a DataFrame; .dtype (singular) is for a single column (Series)
# text columns are listed as "object" because pandas stores Python strings under the generic object dtype

record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object

In [13]:
print(5+5)
# adding two integers always gives back an integer (no decimal point)

10


In [14]:
print(5/9)
# true division (/) always returns a float; here the result is a long decimal
print(8/2)
# even when one integer divides the other evenly, / still returns a float (4.0, not 4)

0.5555555555555556
4.0


In [15]:
# confirm that the result of a division is a float
type(5/9)

float

In [16]:
# confirm that the sum of two integers is an int
type(5+5)

int

In [17]:
# Python keeps the result an integer where it can (e.g. adding two ints), but true division (/) always produces a float

In [18]:
# a number written with a decimal point is a float
a = 7.87
type(a)

float

In [19]:
int(a)
# int() does not round a float: it truncates, discarding everything after the decimal point (7.87 -> 7)

7

In [20]:
# convert the float into an integer and keep the result in a new variable
b = int(a)
b

7

In [21]:
# an integer can also be converted back into a float
c = float(b)
c
# a float is always shown with a decimal point, even if that means just having .0 at the end

7.0

In [22]:
surveys_df['record_id'].dtype
# record_id is currently stored as 64-bit integers

dtype('int64')

In [23]:
# a whole column can be converted as well: astype returns the converted values and assigning them back
# replaces record_id in surveys_df with floats
surveys_df['record_id'] = surveys_df['record_id'].astype('float64')

In [24]:
surveys_df['record_id'].dtype
# float64 is the 64-bit floating-point dtype pandas uses for numeric columns (the "64" is the number of bits per value);
# it is not "bigger" than Python's built-in float, which is also 64-bit on standard builds - the point of the
# fixed-width dtype is that a whole column is stored compactly and can be computed on quickly

dtype('float64')

In [25]:
# the same conversion works on another column; because the result is only displayed and not assigned back,
# plot_id inside surveys_df keeps its original integer dtype
surveys_df.plot_id.astype('float')

0         2.0
1         3.0
2         2.0
3         7.0
4         3.0
         ... 
35544    15.0
35545    15.0
35546    10.0
35547     7.0
35548     5.0
Name: plot_id, Length: 35549, dtype: float64

In [26]:
# but what if we try to convert the weight column to an integer?
# surveys_df.weight.astype('int')
# it results in an error: "cannot convert non-finite values (NA or inf) to integer"
# the weight column contains missing values (NaN), and NaN has no integer representation

In [27]:
# looking at the data type of weight again, we see it is float64
surveys_df['weight'].dtype

dtype('float64')

In [28]:
# but in the first few rows the weight values are all NaN, which means no value was recorded
# (NaN = "not a number", pandas' marker for a missing value)
surveys_df.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1.0,7,16,1977,2,NL,M,32.0,
1,2.0,7,16,1977,3,NL,M,33.0,
2,3.0,7,16,1977,2,DM,F,37.0,
3,4.0,7,16,1977,7,DM,M,36.0,
4,5.0,7,16,1977,3,DM,M,35.0,


In [29]:
# a float column with missing values can still be summarised, because pandas knows how to handle NaN
surveys_df['weight'].mean()
# mean() skips NaN values by default, so the missing weights are left out of the calculation entirely
# (they are not counted as zeros)

42.672428212991356

In [30]:
# number of rows with a missing weight: flag the NaN entries, select those rows, then count them
len(surveys_df.loc[surveys_df['weight'].isnull()])

3266

In [31]:
len(surveys_df[pd.notnull(surveys_df.weight)])
# number of rows where a weight is recorded; testing for "not null" instead of "weight > 0" also counts any
# recorded weight that is zero or negative, so this is the exact complement of the missing-value count above

32283

In [32]:
df1 = surveys_df.copy()
# copy() creates a separate data frame, so the changes made to df1 below do not ruin the original surveys_df

In [33]:
# at this point the copy shows the same rows as surveys_df
df1.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1.0,7,16,1977,2,NL,M,32.0,
1,2.0,7,16,1977,3,NL,M,33.0,
2,3.0,7,16,1977,2,DM,F,37.0,
3,4.0,7,16,1977,7,DM,M,36.0,
4,5.0,7,16,1977,3,DM,M,35.0,


In [34]:
df1['weight'] = df1['weight'].fillna(0)
# fillna(0) replaces every NaN in weight with zero, and the assignment overwrites the weight column of df1
# the filled-in values appear as 0.0 rather than 0 because the column is still a float column

In [35]:
df1.head()
# notice that the weights that were NaN are now 0.0 and no longer count as missing

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1.0,7,16,1977,2,NL,M,32.0,0.0
1,2.0,7,16,1977,3,NL,M,33.0,0.0
2,3.0,7,16,1977,2,DM,F,37.0,0.0
3,4.0,7,16,1977,7,DM,M,36.0,0.0
4,5.0,7,16,1977,3,DM,M,35.0,0.0


In [36]:
# the mean of a float column comes back with many decimal places; note that it is computed on df1,
# where the missing weights have been replaced by zeros
df1.weight.mean()

38.751976145601844

In [37]:
surveys_df['weight'].mean()
# be careful when replacing data: the mean of the zero-filled column above is much smaller than this mean of the
# recorded weights, because every filled-in zero is taken into account

42.672428212991356

In [38]:
# rebuild df1's weight column from the original data, filling each missing weight with the mean of the recorded ones
df1['weight'] = surveys_df.weight.fillna(surveys_df.weight.mean())
# the zero-filled column from before is overwritten; filling with the mean does not pull the average down

In [39]:
df1.head()
# every weight that was initially missing is now set to the mean of the whole column

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1.0,7,16,1977,2,NL,M,32.0,42.672428
1,2.0,7,16,1977,3,NL,M,33.0,42.672428
2,3.0,7,16,1977,2,DM,F,37.0,42.672428
3,4.0,7,16,1977,7,DM,M,36.0,42.672428
4,5.0,7,16,1977,3,DM,M,35.0,42.672428


In [40]:
df1.weight.mean()
# the mean weight matches the original one (apart from floating-point rounding in the last digits),
# because values filled in with the mean do not shift the average

42.67242821299182

# Challenge

Count the number of missing values per column

In [41]:
# len(surveys_df[pd.isnull(surveys_df.weight)]) - this only accounts for one column at a time (can use any column name to replace 
# weight)
# another way to do this is with the count method
surveys_df.count()
# above only prints how many rows there are in each column

record_id          35549
month              35549
day                35549
year               35549
plot_id            35549
species_id         34786
sex                33038
hindfoot_length    31438
weight             32283
dtype: int64

In [42]:
# earlier cells modified surveys_df (record_id was converted to float), so start again from the file
# read in a clean copy of surveys.csv
surveys_df = pd.read_csv('data/surveys.csv')

In [43]:
# keep only the rows that have no missing value in any column
df_na = surveys_df.dropna()
print(df_na.shape[0])
df_na.head()
# 30676 rows remain out of the original 35549: only the cases with no missing data

30676


Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
62,63,8,19,1977,3,DM,M,35.0,40.0
63,64,8,19,1977,7,DM,M,37.0,48.0
64,65,8,19,1977,4,DM,F,34.0,29.0
65,66,8,19,1977,4,DM,F,35.0,46.0
66,67,8,19,1977,7,DM,M,35.0,36.0


In [44]:
# to_csv does not create missing folders, so make sure the output directory exists before writing
Path('data_output').mkdir(parents=True, exist_ok=True)
df_na.to_csv('data_output/surveys_complete.csv', index = False)
# this exports the df_na data created above into a new file in a different location
# index = False leaves the row index out of the file; without it the index is written as an extra column with no
# name, which shows up as "Unnamed: 0" when the file is read back in
# you can also notice above that the row labels of df_na no longer start at 0, see below for how to fix that

In [45]:
# build a re-numbered version whose row labels start again at 0; the result is stored under a new name
df_na_reset = df_na.reset_index(drop = True)
df_na_reset.head()
# you may not always want to throw the old row labels away:
# if you leave drop = True out of the parentheses, the original index numbers are kept as an extra column next to the new ones

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,63,8,19,1977,3,DM,M,35.0,40.0
1,64,8,19,1977,7,DM,M,37.0,48.0
2,65,8,19,1977,4,DM,F,34.0,29.0
3,66,8,19,1977,4,DM,F,35.0,46.0
4,67,8,19,1977,7,DM,M,35.0,36.0


In [46]:
# reset the index in place
df_na.reset_index(drop = True, inplace = True)
# if the indexes are no longer starting at 0 you could get some weird behaviors or errors
df_na.head()
# without inplace, reset_index returns a new data frame with the index reset; you can assign that result with an
# equals sign (as in the previous cell), or let pandas modify df_na directly by passing inplace = True

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,63,8,19,1977,3,DM,M,35.0,40.0
1,64,8,19,1977,7,DM,M,37.0,48.0
2,65,8,19,1977,4,DM,F,34.0,29.0
3,66,8,19,1977,4,DM,F,35.0,46.0
4,67,8,19,1977,7,DM,M,35.0,36.0


In [47]:
# By default to_csv also writes the row index as an extra column with no name; when that csv is read back in, the column
# shows up as "Unnamed: 0". It is not necessary, which is why you pass index = False: the extraneous column is not written