# Imports

In [53]:
import numpy as np
import pandas as pd

# Series

In [54]:
# Pandas series are different from NumPy arrays because they can be indexed by labels instead of integers
my_list = [1, 2, 3, 4]

In [55]:
# cast my_list as a Pandas series
pd.Series(data = my_list)

0    1
1    2
2    3
3    4
dtype: int64

In [56]:
# Now let's change the indeces to letters!
new_indeces = ['a', 'b', 'c', 'd']
my_series = pd.Series(data = my_list, index = new_indeces)
my_series

a    1
b    2
c    3
d    4
dtype: int64

In [57]:
# Now to access the third element, just use the corresponding label "c"
my_series['c']

3

# DataFrames

In [58]:
# pd.read_csv("name of file")

In [59]:
from numpy.random import randint
df = pd.DataFrame(randint(0, 100, [8, 4]), ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], ['I', 'J', 'K', 'L'])
df

Unnamed: 0,I,J,K,L
A,19,98,68,31
B,26,43,39,23
C,21,8,45,25
D,68,93,48,74
E,22,5,32,93
F,71,29,19,70
G,0,63,98,79
H,53,25,37,82


In [77]:
df.head()

Unnamed: 0,I,J,K,L
A,19,98,68,31
B,26,43,39,23
C,21,8,45,25
D,68,93,48,74
E,22,5,32,93


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, A to H
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   I       8 non-null      int64
 1   J       8 non-null      int64
 2   K       8 non-null      int64
 3   L       8 non-null      int64
dtypes: int64(4)
memory usage: 320.0+ bytes


In [62]:
df.describe()

Unnamed: 0,I,J,K,L
count,8.0,8.0,8.0,8.0
mean,35.0,45.5,48.25,59.625
std,25.734912,35.984123,24.516758,28.445123
min,0.0,5.0,19.0,23.0
25%,20.5,20.75,35.75,29.5
50%,24.0,36.0,42.0,72.0
75%,56.75,70.5,53.0,79.75
max,71.0,98.0,98.0,93.0


# Accession

In [63]:
df['J']

A    98
B    43
C     8
D    93
E     5
F    29
G    63
H    25
Name: J, dtype: int64

In [64]:
df.loc['E']

I    22
J     5
K    32
L    93
Name: E, dtype: int64

In [65]:
df.iloc[4]

I    22
J     5
K    32
L    93
Name: E, dtype: int64

In [66]:
# NOTE: the difference between .loc and .iloc is that loc accesses the dataframe through labels
# and .iloc accesses the dataframe through the integer locations of items.

In [78]:
# This returns a Boolean (true/false) based on what rows in column "K" are equal to 4
df["K"]==48

A    False
B    False
C    False
D     True
E    False
F    False
G    False
H    False
Name: K, dtype: bool

In [79]:
# instead of a Boolean, this returns the row that contains the number 4
df[df['K']==48]

Unnamed: 0,I,J,K,L
D,68,93,48,74


In [69]:
# create a new column in a dataframe
# Note that this is only one of mulitple ways to do this
df.insert(4, "M", [21, 23, 24, 21, 15, 84, 12, 38])
df

# the first argument is the integer position of the column
# the second argument is the name of the new column
# the third argument is a series of the values of the new column

Unnamed: 0,I,J,K,L,M
A,19,98,68,31,21
B,26,43,39,23,23
C,21,8,45,25,24
D,68,93,48,74,21
E,22,5,32,93,15
F,71,29,19,70,84
G,0,63,98,79,12
H,53,25,37,82,38


In [70]:
# To delete a column of a dataframe...
del df['M']
df

Unnamed: 0,I,J,K,L
A,19,98,68,31
B,26,43,39,23
C,21,8,45,25
D,68,93,48,74
E,22,5,32,93
F,71,29,19,70
G,0,63,98,79
H,53,25,37,82


# DataFrame Methods

In [71]:
# the .apply() method allows you to apply a function to an axis of a DataFrame
# In this example, we will add up the colums of our DataFrame

def col_sum(data):
  return data.sum()

data = df.apply(col_sum)

data

I    280
J    364
K    386
L    477
dtype: int64

In [72]:
# Now let's do the same things but with a Lambda function, and adding the rows using axis=1

data = df.apply(lambda data: data.sum(), axis=1)

data

A    216
B    131
C     99
D    283
E    152
F    189
G    240
H    197
dtype: int64

# Null Values

In [73]:
# The DataFrame below has a null value
data2 = {'Hat': [1, 2, 3, 4], 'Turtle': [5, 6, np.nan, 8], 'Ponies':[9, 10, 11, 12]}
df2 = pd.DataFrame(data=data2)
df2

Unnamed: 0,Hat,Turtle,Ponies
0,1,5.0,9
1,2,6.0,10
2,3,,11
3,4,8.0,12


In [74]:
# To deal with a null value, either drop it or fill it in

# To drop...
df2.dropna()

Unnamed: 0,Hat,Turtle,Ponies
0,1,5.0,9
1,2,6.0,10
3,4,8.0,12


In [75]:
# ...or maybe we can take the average of the column entries to estimate this value
df2.fillna(df2['Turtle'].mean())

Unnamed: 0,Hat,Turtle,Ponies
0,1,5.0,9
1,2,6.0,10
2,3,6.333333,11
3,4,8.0,12
