# Pandas

You can convert a pandas DataFrame into a NumPy array containing only its values by using .values (be careful with your data types, though). You might want to do this to take the mean of all the data points at once instead of one mean per column, which is what would happen in pandas.

DataFrames can have indexes. Use .loc to select by index label and .iloc to select by integer position (for example, when there is no custom index).

inplace = True means that the original data will be modified. inplace = False means that a new object will be created IT.

In [None]:
# Vectorized operations and index arrays for pandas Series. Notice that each printed
# result is labelled by index: for a + b, index 0 gives 2 and index 1 gives 4.

import pandas as pd

a = pd.Series([1, 2, 3, 4])
b = pd.Series([1, 2, 1, 2])
  
print a + b
print a * 2
print a >= 3
print a[a >= 3]

In [None]:
# Adding DataFrames with overlapping column names
# Notice that if you add a number to NaN you get Nan!

if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
    print df1 + df2

# Adding DataFrames with overlapping row indexes
if True:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                       index=['row1', 'row2', 'row3'])
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                       index=['row4', 'row3', 'row2'])
    print df1 + df2

You can shift indexes and columns with .shift(). Look at documentation, which just happens to be poorly written as well.

In [None]:
# Cumulative entries and exits for one station for a few hours.
# Calculate difference. This is not Entries - exits. This is 
# finding the difference between numbers in it's respective column

entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Turn cumulative counts into per-row increases.

    Takes a DataFrame of cumulative readings (here: entries and exits)
    and returns a DataFrame of the same shape in which every cell holds
    the difference between that row and the row directly above it, taken
    within each column. The first row has no predecessor, so it is NaN.
    '''
    # shift(1) moves every row down by one, lining each reading up with
    # the reading that came before it.
    previous_readings = entries_and_exits.shift(1)
    return entries_and_exits - previous_readings
    
print get_hourly_entries_and_exits(entries_and_exits)

# Another answer is .diff()
# return entries_and_exits.diff() with this you can do differences in
# columns or indexes. For columns it's .diff(axis = 1)

In [None]:
# using .diff()
# default will be differences in indexes. Differences in column
# can be done with .diff(axis = 1)
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})

def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Turn cumulative counts into per-row increases using .diff().

    Takes a DataFrame with cumulative entries and exits and returns a
    DataFrame of the same shape holding, for each column, the difference
    between every row and the row before it. The first row has nothing
    to be compared against, so it comes back as NaN.
    '''
    # .diff() defaults to differences down the index (row minus previous
    # row); use .diff(axis=1) for differences across columns instead.
    return entries_and_exits.diff()
    
print get_hourly_entries_and_exits(entries_and_exits)

## load data into dataframe

In [None]:
import pandas as pd

# Load the CSV into a DataFrame and keep a reference to it so the
# inspection methods below have something to be called on.
data = pd.read_csv("xxx.csv")

data.head() #prints first 5 lines

data.describe() # shows some statistics of each column

In [None]:
# For large files you can peek without loading everything. nrows=6 makes
# read_csv stop after the first 6 data rows (the keyword is "nrows", not "nrow").
pd.read_csv("xxx.csv", nrows=6).head()

In [None]:
# Peek in the middle of a large file: skip the first 10 lines, then read
# 6 rows. header=None because the header line is among the skipped lines.
pd.read_csv("xxx.csv", nrows=6, skiprows=10, header=None).head()

# Extra features not found in numpy

In [None]:
# Main difference is that panda series has has something called an index
# and columns

#numpy would look like this
# countries = np.array(["albania", "Algeria", "Andorra"])
# life_expectancy = np.array([74.7, 75., 83.4])

# pandas looks like
life_expectancy = pd.Series([74.7, 75., 83.4],
                   index = ["albania",
                            "Algeria", 
                            "Andorra"])

print life_expectancy

# NumPy arrays are like souped-up Python lists
# Pandas series is like a cross between a list and a dictionary

# you can use .loc to find using the index without needing to know the position in t
# the list
print life_expectancy.loc["Algeria"]

# When you don't have an index consider the following when accessing positions.
# you should use .iloc
print life_expectancy.iloc[0] # if there was not index givin in the series
print life_expectancy[0] # this has the same result but is confusing because it's NOT an index

In [None]:
# finding index with that has max value
# use idxmax()
employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695
]

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola'
]

employment = pd.Series(employment_values, index=countries)

def max_employment(employment):
    '''
    Find the entry with the largest value in a Series.

    Returns a (label, value) tuple: the index label reported by
    idxmax() and the value stored under that label.
    '''
    top_country = employment.idxmax()
    return (top_country, employment.loc[top_country])

max_employment(employment)

In [None]:
# The describe() function lists a lot of statistical info
a = pd.Series([1, 2, 3, 4])

a.describe()



# Drop NaN or fill missing indexes with 0

In [None]:
import pandas as pd

# Indexes that don't overlap. you get NaN
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'a', 'h'])
sum_result = s1 + s2
print sum_result

# to get rid of NaN use .dropna()
print sum_result.dropna()

# to fill in NaN with 0 you can use .fillna(0)
print sum_result.fillna(0)

In [None]:
# how to add series with missing indexes so the missing
# get added as 0 instead of NaN. Just had to google this question to 
# find this answer
import pandas as pd

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])

# Try to write code that will add the 2 previous series together,
# but treating missing values from either series as 0. The result
# when printed out should be similar to the following line:
# print pd.Series([1, 2, 13, 24, 30, 40], index=['a', 'b', 'c', 'd', 'e', 'f'])

combine = s1 + s2
s1.add(s2, fill_value = 0)

# Using .apply() to use functions instead of loops (for whole series IT)

.apply can also be used on DataFrames, but when you use it there your function is called on a whole Series (column or row) at a time IT.

In [None]:
# you can use APPLY instead of loops to apply a function to a whole panda series. 
# notice the function is in the parenthesis of apply

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus'])


# Make function to return a new series where each name
# in the input series has been transformed from the format
# "Firstname Lastname" to "Lastname, FirstName".
# Try to use the Pandas apply() function rather than a loop.

# easier to make a def for one name first to test that it works
def reverse_name(name):
    '''
    Convert a single name from "Firstname Lastname" to
    "Lastname, Firstname".

    The name is split on single spaces; the first piece is taken as the
    first name and the second piece as the last name. A name with no
    space raises IndexError.
    '''
    parts = name.split(" ")
    first_name = parts[0]
    last_name = parts[1]
    # Last name goes first, as the target format requires.
    return last_name + ", " + first_name

print reverse_name(names.iloc[0]) #.iloc clearly shows POSITION for series missing an index
    
# now that you know it works you can use APPLY() to use for whole series
def reverse_names(players):
    '''
    Return a new Series with reverse_name() applied to every element of
    the given Series, using apply() instead of an explicit loop.
    '''
    # Operate on the argument that was passed in, not on the module-level
    # `names` Series, so the function works for any input.
    return players.apply(reverse_name)
    
reverse_names(names)

## Using .apply(function) on DataFrames. Use this to operate on whole columns or rows at a time instead of on each element of the DataFrame.

Note when using standard deviation like in example below: In order to get the proper computations, we should actually be setting the value of the "ddof" parameter to 0 in the .std() function.

Note that the type of standard deviation calculated by default is different between numpy's .std() and pandas' .std() functions. By default, numpy calculates a population standard deviation, with "ddof = 0". On the other hand, pandas calculates a sample standard deviation, with "ddof = 1". If we know all of the scores, then we have a population - so to standardize using pandas, we need to set "ddof = 0".

In [None]:
# example
import pandas as pd

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def standardize(df):
    '''
    Exercise placeholder: standardize every column of a DataFrame.

    Standardizing a variable means re-expressing each value as the
    number of standard deviations it lies above or below the mean.
    Nothing is computed here yet; this stub always returns None.
    '''
    return None

# see steps for solution below

In [None]:
# Make a function to standardize a Series (in this case it will be the exam1 or exam2 column)

def standardize_column(data):
    '''
    Standardize one Series: subtract its mean, then divide by its
    population standard deviation (ddof=0).
    '''
    centered = data - data.mean()
    spread = data.std(ddof=0)
    return centered / spread
    
print standardize_column(grades_df["exam1"])

In [None]:
# insert that function into another function with .apply(first_function) so that you can apply it
# to the columns or rows in the dataframe
def standardize(df):
    '''
    Standardize a DataFrame by handing each of its columns to
    standardize_column() through apply().
    '''
    standardized = df.apply(standardize_column)
    return standardized

print standardize(grades_df)

### you can also use apply to apply functions on series in a dataframe and return just a single value instead of the whole new series again. See example below.

In [None]:
# example to use .apply for using a function to be applies to a series in dataframe (can be row or column) and
# return just a single calculation value.

import numpy as np
import pandas as pd

# To get the idea. Change False to True for this block of code to see what it does

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

if True:
    # apply() hands each column to the function as a Series; a function
    # that returns a single number collapses every column to one value.
    print(df.apply(np.mean))
    # Column-wise range: largest value minus smallest value.
    print(df.apply(np.max) - df.apply(np.min))

In [None]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

def second_largest(df):
    '''
    Exercise placeholder: report the second-largest value found in
    each column of the input DataFrame.

    Nothing is computed here yet; this stub always returns None.
    '''
    return None

# see solution below

In [None]:
# step 1. Make function that works on one column
# sort_value() works on a panda series.

def sort_order_2nd_largest(column):
    '''
    Return the second-largest value of a Series: sort it in descending
    order and take the element at position 1.
    '''
    return column.sort_values(ascending=False).iloc[1]

sort_order_2nd_largest(df["a"]) # test it

In [None]:
# step 2. Use apply(function) so sort_value() can be used on all series in the dataframe.
def second_largest(df):
    '''
    Return the second-largest value of every column by running
    sort_order_2nd_largest() on each column through apply().
    '''
    runner_ups = df.apply(sort_order_2nd_largest)
    return runner_ups

second_largest(df)

# Using applymap to use functions to be used on all elements in a dataframe. (not treating as a series but on each element)

In [None]:
# Example
import pandas as pd

# Change False to True for this block of code to see what it does

# DataFrame applymap()
if False:
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [10, 20, 30],
        'c': [5, 10, 15]
    })

    def add_one(x):
        '''Return x increased by one.'''
        return x + 1

    # applymap() calls add_one on each individual element of the frame.
    # NOTE: pandas 2.1+ deprecates applymap() in favour of DataFrame.map().
    print(df.applymap(add_one))

In [None]:
# example 2
# convert scores to letter grades
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

# step 1. Make function that converts one element
def convert_score(grade):
    '''
    Map a numeric score to a letter grade.

    90 and up is "A", 80 and up is "B", 70 and up is "C", 60 and up is
    "D", and anything else is "F".
    '''
    # Ordered from the highest cutoff down so the first match wins.
    cutoffs = ((90, "A"), (80, "B"), (70, "C"), (60, "D"))
    for minimum, letter in cutoffs:
        if grade >= minimum:
            return letter
    return "F"
    
# test it
convert_score(80)

In [None]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)   
    
# then use applymap() to apply one function to whole datafram
def convert_grades(grades):
    '''
    Convert every numeric score in the DataFrame to a letter grade by
    running convert_score() on each element through applymap().
    '''
    letter_grades = grades.applymap(convert_score)
    return letter_grades
    
convert_grades(grades_df)

# Adding dataframe to series

In [None]:
# Copy and paste these individually to see them visually as outputs (see cell below). It's easier to see what's going on.
# cool trick is to add the series with .add(series, axis = "index"). It treats the series as index instead of column (default IT)
# see example of changing axis somewhere in cell below

import pandas as pd

# Change False to True for each block of code to see what it does

# Adding a Series to a square DataFrame
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    
# Adding a Series to a one-row DataFrame 
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s

# Adding a Series to a one-column DataFrame
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    

    
# Adding when DataFrame column names match Series index
if True:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    
# Adding when DataFrame column names don't match Series index
if True:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s

In [None]:
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})

print df
print '' # Create a blank line between outputs
print df + s

In [None]:
s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print df
print '' # Create a blank line between outputs
print df.add(s, axis='index')
    # The functions sub(), mul(), and div() work similarly to add()

### adding dataframe to series problem with also changing axis

In [None]:
# Standerdize each ROW using vectorized operations
# equation if you did it by column would be (grades_df - grades_df.mean()) / grades_df.std(ddof=0). so you need 
# to change the axis

# look at the dataframe to see things clearly

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

grades_df

In [None]:
# get mean of each ROW
# equation if you did it by column would be (grades_df - grades_df.mean()) / grades_df.std(ddof=0). so you need 
# to change the axis

grades_df.mean(axis = "columns")

In [None]:
# you can't subtract the mean by ROW from the grades_df because your just gonna get a 
# bunch of NaNs because these means by ROW don't match up with the the columns of the df. Try it and see.
# So you need to subtract by the ROWS of the grades_df. 


# equation if you did it by column would be (grades_df - grades_df.mean()) / grades_df.std(ddof=0). so you need 
# to change the axis

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

grades_df - grades_df.mean(axis = "columns")

In [None]:
# change the axis of the df. Since you did mean by row you want to subract that mean by ROW 
# of the dataframe as well which is it's index. Use sub() to you can change the axis of the df from 
# which your subracting. Your matching your averages taken by row to the index of the dataframe

# equation if you did it by column would be (grades_df - grades_df.mean()) / grades_df.std(ddof=0). so you need 
# to change the axis

mean_diffs = grades_df.sub(grades_df.mean(axis = "columns"), axis = "index")
mean_diffs

In [None]:
# now you need to divide the standard deviation. Again, since your taking the standard deviation by row you want to 
# match those values with the dataframes indexes. So, you need to change the axis of the dataframe to "index"

# equation if you did it by column would be (grades_df - grades_df.mean()) / grades_df.std(ddof=0). so you need 
# to change the axis

mean_diffs.div(grades_df.std(ddof=0, axis = "columns"), axis = "index")

In [None]:
# If you used a function, here is one way it could look. It was somebody else's and is very clean.
def standardize_rows(df):
    '''
    Standardize each row of the given DataFrame without using apply().

    Every value is converted to the number of population standard
    deviations (ddof=0) it lies above or below the mean of its own row.
    A new DataFrame is returned; the input DataFrame is left unchanged.
    '''
    # Row-wise statistics: one mean and one standard deviation per index
    # label.
    row_means = df.mean(axis="columns")
    row_stds = df.std(axis="columns", ddof=0)

    # The statistics are indexed by row label, so they must be matched
    # against the DataFrame's index (axis="index"); the default would try
    # to match them against the column names and produce NaNs.
    mean_diffs = df.sub(row_means, axis="index")
    return mean_diffs.div(row_stds, axis="index")
    
    
print standardize_rows(grades_df)

# Grouping data using groupby()

you can see the mapping of the groups by using the .groups attribute when
using groupby()

In [None]:
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Change False to True for each block of code to see what it does

# Examine DataFrame
if False:
    print example_df
    
# Examine groups
if False:
    grouped_data = example_df.groupby('even')
    # The groups attribute is a dictionary mapping keys to lists of row indexes
    print grouped_data.groups
    
# Group by multiple columns
if False:
    grouped_data = example_df.groupby(['even', 'above_three'])
    print grouped_data.groups
    
# Get sum of each group
if False:
    grouped_data = example_df.groupby('even')
    print grouped_data.sum()
    
# Limit columns in result
if False:
    grouped_data = example_df.groupby('even')
    
    # You can take one or more columns from the result DataFrame
    print grouped_data.sum()['value']
    
    print '\n' # Blank line to separate results
    
    # You can also take a subset of columns from the grouped data before 
    # collapsing to a DataFrame. In this case, the result is the same.
    print grouped_data['value'].sum()

In [None]:
# complex example of how to use groupby() in pandas

import pandas as pd

engagement_df = pd.DataFrame({
    "account_key": ["1", "3", "3"],
    "utc_date": ["2015-03-04", "2015-10-02", "2015-01-21"],
    "total_minutes_visited": [331.3, 65.7, 902.4],
    "minutes_sleeping": [12.2, 20.0, 34.3]
})


# calculated mean of all students together
engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()
# you can see the mapping of the groups by using the .groups attribute when
# using groupby()
# breakdown of what this means step by step below

In [None]:
# Creates an object without you writing it equal to an
# object. To see the contents of object use .groups
# engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()
print engagement_df.groupby("account_key")
print engagement_df.groupby("account_key").groups

In [None]:
# adds each column in each group. notice it's in a dataframe and that the 
# date column is not included because it has no value. 
# engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()
engagement_df.groupby("account_key").sum()

In [None]:
# specifies one column your interested in and changes it into a pandas series 
# instead of a dataframe
# engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()
engagement_df.groupby("account_key").sum()["total_minutes_visited"]

In [None]:
# takes the mean of all of this
# engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()
engagement_df.groupby("account_key").sum()["total_minutes_visited"].mean()

In [None]:
#plot this

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%pylab inline
import seaborn as sns # makes nicer plots

# Merging several dataframes

similar to join for SQL.
Watch this video
https://www.youtube.com/watch?time_continue=82&v=vB_Et1hz_2M
you can also merge columns that have different names if they happened to be spelled differently

# Creating plots

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns # makes nicer plots

# you need this line to show the plot when using python notebook!
%matplotlib inline


data_set = pd.Series([1, 100, 500, 1076])
               
# make histograms
# data.hist()
data_set.hist() # with gridlines
data_set.plot.hist() # without gridlines

# plots
# data.plot()
# s1.plot()

In [None]:
import seaborn as sns

%pylab inline
data_set.plot()

# Dataframes

Can have different types of data (strings, int, etc) in each column and perform calculations even though they are different types.

You can convert panda dataframes into numpy arrays containing only the values of the pandas dataframe using .values. (have to be careful with your datatypes though). You might want to do this to take the mean of all the data points instead of just one column which is what would happen in pandas.

Look at the code above for an example of turning a pandas DataFrame into a NumPy array of values to perform some functions!

### Operation along axis


Operations along an axis. Most functions in NumPy and pandas take an (axis = 0) or (axis = 1) argument, which calculates the function for each column or for each row.

(axis = 0) takes each column
(axis = 1) takes each row

In [None]:
import pandas as pd

# Pandas axis
if True:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.values.sum() # se below for description of .values

### Converting pandas dataframes to numpy arrays with values to use functions on whole set

See above and below for examples. You have to be careful with data types. When you use .values you're changing the pandas DataFrame to a NumPy array IT.


In [None]:
df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

df.values.sum()
df.values.max()

In [None]:
# Creating DataFrames
if True:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df_1

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print df_2
    print df_2.values.mean()

In [None]:
import pandas as pd

ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691]],
    index= ['05-01-11', '05-02-11', '05-03-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
      
# Accessing elements
if True:
    print ridership_df.iloc[0] # not using index
    print ridership_df.loc['05-02-11'] # using index
    print ridership_df['R003'] # selecting column
    print ridership_df.iloc[1, 3] # selecting specific row and column

In [None]:
import pandas as pd

ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691]],
    index= ['05-01-11', '05-02-11', '05-03-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Accessing multiple rows
if True:
    print ridership_df.iloc[1:3] # doesn't include the row 3
    
# Accessing multiple columns
if True:
    print ridership_df[['R003', 'R005']]

# Correlations and Pearson R

* Pearson's R only measures LINEAR correlations!
* Pandas std() function automatically uses Bessel's correction so you need to add (ddof = 0) so that it's NOT used.
* This is just to show you what's going on. You should actually just use NumPy's corrcoef() function instead of doing this. It's easier.
1. standardize each variable. This converts both variables to a similar scale.
2. multiply each pair of values, and take the average
r = average of [(x in standard units) * (y in standard units)]
Look at example below

In [None]:
import pandas as pd

def correlation(x, y):
    '''
    Compute the correlation (Pearson's r) between two input variables.
    Each input is either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    ddof=0 is passed to std() explicitly so that the population standard
    deviation is used (Pandas defaults to ddof=1).
    '''
    # Convert both variables to standard units so they share a scale.
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)

    # Multiply the paired values element-wise, then average the products.
    return (std_x * std_y).mean()

# Now you can plug in different columns and see if they're correlated. The closer you get to 1, the stronger the positive linear
# correlation between those 2 columns. The closer to -1, the stronger the negative linear correlation.

In [None]:
import pandas as pd
