### Reading CSV files from various places

In [None]:
import pandas as pd

In [None]:
# From a network resource
# pd.read_csv accepts a URL directly, so no separate download step is needed.
MTCARS_URL = 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv'
ncsv = pd.read_csv(MTCARS_URL)
# Preview the first rows to confirm the file parsed as expected.
print(ncsv.head())

               model   mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  \
0          Mazda RX4  21.0    6  160.0  110  3.90  2.620  16.46   0   1     4   
1      Mazda RX4 Wag  21.0    6  160.0  110  3.90  2.875  17.02   0   1     4   
2         Datsun 710  22.8    4  108.0   93  3.85  2.320  18.61   1   1     4   
3     Hornet 4 Drive  21.4    6  258.0  110  3.08  3.215  19.44   1   0     3   
4  Hornet Sportabout  18.7    8  360.0  175  3.15  3.440  17.02   0   0     3   

   carb  
0     4  
1     4  
2     1  
3     1  
4     2  


In [None]:
# Direct upload to colab
from google.colab import files
from io import BytesIO

# Name the file is expected to have when it is picked in the upload dialog.
UPLOAD_NAME = 'mtcars.csv'

uploads = files.upload()
# File is uploaded as well as stored as bytes in a dict
# key is the uploaded file name
if UPLOAD_NAME not in uploads:
  # Cancelled dialog or a differently named file: fail with an actionable
  # message instead of a bare KeyError from the lookup below.
  raise KeyError(
      f"Expected an upload named {UPLOAD_NAME!r}, got: {sorted(uploads)}"
  )
# Wrap the raw bytes in a file-like object so pandas can parse them.
pseudofile = BytesIO(uploads[UPLOAD_NAME])
ccsv = pd.read_csv(pseudofile)

Saving mtcars.csv to mtcars.csv


Alternatively, upload the file directly through the file browser in the Colab sidebar.

### Pandas basics

In [None]:
# Quick overview of the uploaded frame: first rows, dimensions, summary
# statistics. Each heading and its value are emitted on consecutive lines.
print("Head of data:", ccsv.head(), sep="\n")
print("\nShape of data:", ccsv.shape, sep="\n")
print("\nDescription of data:", ccsv.describe(), sep="\n")

Head of data:
               model   mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  \
0          Mazda RX4  21.0    6  160.0  110  3.90  2.620  16.46   0   1     4   
1      Mazda RX4 Wag  21.0    6  160.0  110  3.90  2.875  17.02   0   1     4   
2         Datsun 710  22.8    4  108.0   93  3.85  2.320  18.61   1   1     4   
3     Hornet 4 Drive  21.4    6  258.0  110  3.08  3.215  19.44   1   0     3   
4  Hornet Sportabout  18.7    8  360.0  175  3.15  3.440  17.02   0   0     3   

   carb  
0     4  
1     4  
2     1  
3     1  
4     2  

Shape of data:
(32, 12)

Description of data:
             mpg        cyl        disp          hp       drat         wt  \
count  32.000000  32.000000   32.000000   32.000000  32.000000  32.000000   
mean   20.090625   6.187500  230.721875  146.687500   3.596563   3.217250   
std     6.026948   1.785922  123.938694   68.562868   0.534679   0.978457   
min    10.400000   4.000000   71.100000   52.000000   2.760000   1.513000   
25%    

### Exercise 3
Chi-squared test and t-test

In [None]:
import pandas as pd

In [None]:
# Load data
MTCARS_URL = 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv'
data_frame = pd.read_csv(MTCARS_URL)

# Sanity-check the load: first rows, dimensions, then summary statistics.
# Each heading and its value are emitted on consecutive lines.
print("Head of data:", data_frame.head(), sep="\n")
print("\nShape of data:", data_frame.shape, sep="\n")
print("\nDescription of data:", data_frame.describe(), sep="\n")

Head of data:
               model   mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  \
0          Mazda RX4  21.0    6  160.0  110  3.90  2.620  16.46   0   1     4   
1      Mazda RX4 Wag  21.0    6  160.0  110  3.90  2.875  17.02   0   1     4   
2         Datsun 710  22.8    4  108.0   93  3.85  2.320  18.61   1   1     4   
3     Hornet 4 Drive  21.4    6  258.0  110  3.08  3.215  19.44   1   0     3   
4  Hornet Sportabout  18.7    8  360.0  175  3.15  3.440  17.02   0   0     3   

   carb  
0     4  
1     4  
2     1  
3     1  
4     2  

Shape of data:
(32, 12)

Description of data:
             mpg        cyl        disp          hp       drat         wt  \
count  32.000000  32.000000   32.000000   32.000000  32.000000  32.000000   
mean   20.090625   6.187500  230.721875  146.687500   3.596563   3.217250   
std     6.026948   1.785922  123.938694   68.562868   0.534679   0.978457   
min    10.400000   4.000000   71.100000   52.000000   2.760000   1.513000   
25%    

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test of independence between mpg and each remaining column.
#
# chi2_contingency expects a contingency table of observed *counts*. Feeding it
# the raw measurements (one row per variable) makes it treat every reading as a
# frequency, so the resulting p-values do not test independence. Each column is
# therefore reduced to a few categories and the pair is cross-tabulated into a
# proper count table first.
N_BINS = 4  # quantile bins used for columns with many distinct values


def _as_categories(column, n_bins=N_BINS):
  """Return `column` as a small set of categories suitable for pd.crosstab.

  Columns with at most `n_bins` distinct values are returned unchanged.
  Anything else is cut into quantile bins; duplicate bin edges are dropped,
  so fewer than `n_bins` bins may result.
  """
  if column.nunique() <= n_bins:
    return column
  return pd.qcut(column, q=n_bins, duplicates='drop')


target = data_frame.mpg
# Skip the first two columns (model and mpg in the head() preview).
features = data_frame.columns[2:]
alpha = 0.05
target_bins = _as_categories(target)

# NOTE(review): with few rows the expected cell counts can be small, which
# weakens the chi-square approximation -- treat borderline p-values with care.
for feature in features:
  observed = pd.crosstab(target_bins, _as_categories(data_frame[feature]))
  stat, p, dof, expected = chi2_contingency(observed)
  print(f"p value for {feature} is {p}")
  if p <= alpha:
    print('Dependent features (reject H0)')
  else:
    # Strictly this is "fail to reject H0": a large p-value is absence of
    # evidence for dependence, not proof of independence.
    print('Independent features (accept H0)')

p value for cyl is 0.03907998201790578
Dependent features (reject H0)
p value for disp is 2.5135660680278938e-86
Dependent features (reject H0)
p value for hp is 3.375793921389507e-51
Dependent features (reject H0)
p value for drat is 0.9999999808305179
Independent features (accept H0)
p value for wt is 0.5219273189153912
Independent features (accept H0)
p value for qsec is 0.9117996933708745
Independent features (accept H0)
p value for vs is 0.9981162454068212
Independent features (accept H0)
p value for am is 0.9943234656962614
Independent features (accept H0)
p value for gear is 0.9999985024980557
Independent features (accept H0)
p value for carb is 0.04371930844420983
Dependent features (reject H0)


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test of mpg against every remaining column
target = data_frame.mpg
features = data_frame.columns[2:]
alpha = 0.05

for feature in features:
  # The result unpacks like a 2-tuple: (test statistic, p-value)
  stat, p = ttest_ind(target, data_frame[feature])
  verdict = (
      'Reject H0: Population means differ.'
      if p <= alpha
      else 'Accept H0: Population means are same.'
  )
  print(f'mpg, {feature}: {verdict}')

mpg, cyl: Reject H0: Population means differ.
mpg, disp: Reject H0: Population means differ.
mpg, hp: Reject H0: Population means differ.
mpg, drat: Reject H0: Population means differ.
mpg, wt: Reject H0: Population means differ.
mpg, qsec: Reject H0: Population means differ.
mpg, vs: Reject H0: Population means differ.
mpg, am: Reject H0: Population means differ.
mpg, gear: Reject H0: Population means differ.
mpg, carb: Reject H0: Population means differ.


Of course the population means differ: we are comparing two different features with each other, not the same feature measured in two populations.

In [None]:
alpha = 0.05
target = data_frame.mpg
# Two samples of the same feature: the first 16 mpg readings vs. the rest.
first_sample, second_sample = target[:16], target[16:]
stat, p = ttest_ind(first_sample, second_sample)
verdict = (
    'Reject H0: Population means differ.'
    if p <= alpha
    else 'Accept H0: Population means are same.'
)
print(f'({p}) {verdict}')

(0.07542298020349125) Accept H0: Population means are same.


Here, we split the `mpg` column of the dataset into two samples and performed a t-test on them.

The means of the groups **do not** seem to have a statistically significant difference.