## Basic statistics with numpy

In [1]:
import numpy as np

In [2]:
#Two small integer samples reused by the numpy and scipy examples below
x = [12, 34, 5, 33, 23]
y = [45, 23, 67, 12, 78]

In [3]:
#Arithmetic mean of the values in x
np.mean(x)

21.4

In [4]:
#SD: population standard deviation of x (np.std divides by N, i.e. ddof=0, by default;
#pass ddof=1 for the sample standard deviation)
np.std(x)

11.42978564978364

In [6]:
#Pearson correlation matrix of x and y; the off-diagonal entries hold the coefficient
np.corrcoef(x, y)

array([[ 1.        , -0.67526293],
       [-0.67526293,  1.        ]])

## Basic statistics with scipy

In [8]:
from scipy.stats import skew, kurtosis

In [9]:
#Skewness of x (bias=True is scipy's default: no correction for sample bias)
skew(x, bias=True)

-0.22450768437401813

In [10]:
#Excess kurtosis of x (Fisher definition, so a normal distribution scores 0; no bias correction)
kurtosis(x, fisher=True, bias=True)

-1.5531606797793072

## Basic statistics in pandas

In [4]:
import pandas as pd

In [5]:
#Read our video dataset from a CSV file (relative path, so it is resolved against the
#current working directory)
#NOTE(review): the 'Unnamed: 0' column in the columns listing suggests the file was saved
#together with its index; index_col=0 might be wanted here - confirm
videos = pd.read_csv('YouTube_climatechange.csv')

In [6]:
#List the column names available in the dataset
videos.columns

Index(['Unnamed: 0', 'position', 'channelId', 'channelTitle', 'videoId',
       'publishedAt', 'publishedAtSQL', 'videoTitle', 'videoDescription',
       'videoCategoryId', 'videoCategoryLabel', 'duration', 'durationSec',
       'dimension', 'definition', 'caption', 'thumbnail_maxres',
       'licensedContent', 'viewCount', 'likeCount', 'dislikeCount',
       'favoriteCount', 'commentCount', 'category'],
      dtype='object')

In [9]:
#Skewness for one column
#NOTE(review): the recorded result is nan. scipy's skew propagates NaNs by default
#(nan_policy='propagate'), so likeCount presumably contains missing values - confirm.
#Passing nan_policy='omit' would skip them instead.
skew(videos['likeCount'])

nan

### Correlation

In [4]:
#Pearson correlation with the built-in pandas Series method
#(pandas documents Series.corr as excluding missing values)
videos['viewCount'].corr(videos['likeCount'], method='pearson')

0.8660324366695933

In [10]:
#using scipy
from scipy.stats import pearsonr

In [11]:
#using scipy (pearsonr was imported in the previous cell)
#calculate the correlation coefficient between dislikes and likes, together with its p-value
#At this point the call raises a ValueError, as shown below: pearsonr does not accept NaNs
pearsonr(videos['dislikeCount'], videos['likeCount'])

ValueError: array must not contain infs or NaNs

In [12]:
#Remove missings: drop every row where either count is absent
videos = videos.dropna(
    subset=['likeCount', 'dislikeCount'],
    how='any',
)

In [13]:
#Correlation coefficient and p-value for dislikes vs. likes, now that rows with missings are gone
pearsonr(videos['dislikeCount'], videos['likeCount'])

(0.4578374691259312, 1.3203721424346202e-26)

## Many more options

In scipy and statsmodels you will find both the most commonly used statistical tests and more advanced statistical models.

More on this on Friday

In [17]:
#ANOVA with statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

#Define and fit the model: the formula reads DV ~ IV, and data is the frame it is evaluated on
mod = ols(formula='likeCount ~ category', data=videos).fit()

#Apply a type-2 ANOVA to the fitted model (different models could be used here)
aov_table = sm.stats.anova_lm(mod, typ=2)

#Print the results
print(aov_table)

                sum_sq     df         F    PR(>F)
category  4.271378e+09    3.0  4.789811  0.002677
Residual  1.435739e+11  483.0       NaN       NaN
