# Variance 

In [33]:
import pandas as pd

# Raw data: one list of values per column
my_subjects_raw = {
    'subject': ['Calculus', 'Intro to music', 'Psychology', 'Sociology'],
    'grades': [4.0, 3.3, 3.9, 4.0],
    'attendance (%)': [90, 60, 78, 100],
    'participation (%)': [100, 20, 80, 92],
    'semester': ['fall', 'spring', 'spring', 'fall'],
}

# Build the pandas dataframe; the dict keys (in insertion order) double as the
# explicit column order
my_subjects_pd = pd.DataFrame(my_subjects_raw, columns=list(my_subjects_raw))

# Show the resulting table
my_subjects_pd

Unnamed: 0,subject,grades,attendance (%),participation (%),semester
0,Calculus,4.0,90,100,fall
1,Intro to music,3.3,60,20,spring
2,Psychology,3.9,78,80,spring
3,Sociology,4.0,100,92,fall


In [36]:
# The function var() returns a table with the variance of every column. Only quantitative columns have a
# variance, so we pass numeric_only=True to leave out the text columns ('subject' and 'semester'); recent
# pandas versions (2.0+) no longer drop them silently and fail if the argument is omitted.
print(my_subjects_pd.var(numeric_only=True))

# To print the variance of only one column, we can do dataframe['nameOfColumn'].var()
print(my_subjects_pd['grades'].var())

grades                  0.113333
attendance (%)        296.000000
participation (%)    1316.000000
dtype: float64
0.1133333333333334


In [67]:
# Calling my_subjects_pd.var() on the whole dataframe, without selecting a column, makes pandas complain:
# the dataframe contains qualitative (text) variables, which have no variance and are left out of the table.
# Passing numeric_only=True inside the parentheses restricts the computation to the quantitative columns
# and avoids the message.

print(my_subjects_pd.var(numeric_only=True))

grades                  0.113333
attendance (%)        296.000000
participation (%)    1316.000000
dtype: float64


# Standard Deviation

In [35]:
# dataframe.std() will display a table with the standard deviation of every quantitative column;
# numeric_only=True leaves the text columns out of the computation.

my_subjects_pd.std(numeric_only=True)

grades                0.336650
attendance (%)       17.204651
participation (%)    36.276714
dtype: float64

In [30]:
# If we want to check only one column, we can follow the same pattern we've been using:
# dataframe['columnName'].std()

my_subjects_pd['grades'].std()

0.33665016461206937

# Correlation Coefficient 

In [55]:
# Sample data: concerts attended and shoe size per subject
sample_correlation = {
    'subject id': ['0111', '9203', '2313', '4503'],
    'concerts attended': [3, 2, 10, 4],
    'shoe size': [7, 7, 7, 7.5],
}

# Build the pandas dataframe, keeping the dict's key order as the column order
sample_correlation_pd = pd.DataFrame(sample_correlation, columns=list(sample_correlation))


In [58]:
# To calculate correlation we will make use of the corr() function. Again, we will follow the format
# dataframe.corr()

# Meanings:
# 1: Full correlation. If one variable increases, so does the other
# 0: No correlation.
# -1: Full negative correlation. If one variable increases, the other decreases.

# Let's see the correlation between concerts attended and shoe size. Should we expect a value closer to -1, 0 or 1?

# 'subject id' holds identifiers stored as strings, not measurements. Since pandas 2.0 numeric_only
# defaults to False, so such columns are no longer excluded automatically; we exclude them explicitly.
sample_correlation_pd.corr(numeric_only=True)

Unnamed: 0,concerts attended,shoe size
concerts attended,1.0,-0.139122
shoe size,-0.139122,1.0


In [62]:
# Sample data: class attendance and GPA per subject
sample_correlation = {
    'subject id': ['0111', '9203', '2313', '4503'],
    'class attendance (%)': [99, 65, 100, 45],
    'GPA': [4.0, 3.2, 3.8, 2.9],
}

# Build the pandas dataframe, keeping the dict's key order as the column order
sample_correlation_pd = pd.DataFrame(sample_correlation, columns=list(sample_correlation))

In [63]:
# Correlation between class attendance and GPA. numeric_only=True keeps the string 'subject id'
# column out of the computation (pandas 2.0+ no longer excludes it by default).
sample_correlation_pd.corr(numeric_only=True)

Unnamed: 0,class attendance (%),GPA
class attendance (%),1.0,0.982727
GPA,0.982727,1.0


# Range

In [72]:
# To calculate the range we have to make use of the minimum and maximum methods.

# Let's store the max and min of the 'grades' column of my_subjects_pd

subjects_max = my_subjects_pd['grades'].max()
subjects_min = my_subjects_pd['grades'].min()

# Let's print both values to get a better sense of what we're doing. Note that we cannot simply add a STRING
# and a NUMBER. Thus, we have to transform both numbers to string format by making use of the function str()
print("The maximum achieved grade was: " + str(subjects_max))
print("The minimum achieved grade was: " + str(subjects_min))

# Now, we're going to store the subtraction and print the final result. The result is stored as
# grades_range rather than `range`, so that Python's built-in range() stays usable in later cells.
grades_range = subjects_max - subjects_min
print("Thus, grades' range is: " + str(grades_range))

The maximum achieved grade was: 4.0
The minimum achieved grade was: 3.3
Thus, grades' range is: 0.7000000000000002


# Describing dataframes

In [None]:
# To get an idea of our data more quickly, we can make use of three different methods.

# To look at the first few rows of a dataframe we can call dataframe.head(); similarly, to take a look at the
# last few rows we can call dataframe.tail(). This will come in handy when we are dealing with big dataframes.

In [76]:
# We can also display a summary of the numeric columns by typing my_subjects_pd.describe()
# This will show count, mean, std, min, 25%, 50%, 75%, and max.

my_subjects_pd.describe()

Unnamed: 0,grades,attendance (%),participation (%)
count,4.0,4.0,4.0
mean,3.8,82.0,73.0
std,0.33665,17.204651,36.276714
min,3.3,60.0,20.0
25%,3.75,73.5,65.0
50%,3.95,84.0,86.0
75%,4.0,92.5,94.0
max,4.0,100.0,100.0


In [82]:
# Sometimes we will encounter missing data in our dataframes. To get a sense of how many values are missing
# in each column we can type dataframe.isna().sum() (isnull() is an alias of isna())

my_subjects_pd.isna().sum()

subject              0
grades               0
attendance (%)       0
participation (%)    0
semester             0
dtype: int64

In [83]:
# Display the whole dataframe again for reference
my_subjects_pd

Unnamed: 0,subject,grades,attendance (%),participation (%),semester
0,Calculus,4.0,90,100,fall
1,Intro to music,3.3,60,20,spring
2,Psychology,3.9,78,80,spring
3,Sociology,4.0,100,92,fall


# Pandas plotting

We can plot pandas data frames directly by using the plot( ) method. Inside the parentheses we can include seven different parameters.

* kind : Used to indica