# Numerical Python -- continued

In [16]:
from nose.tools import assert_equal  # tool for testing solutions
import numpy as np

### 1. Write and use a class

Consider the data files located in `data/MCH`. Notice that there are many of the files, and that it would be a lot of work to read them all in by hand. Since there is common information in the files, let's write a class to interact with the files.

Write a class, `CTD`, to store some of the information in a single file within a class. The class should:

* be initialized with the name of the file, `filename`, and this value should be stored in the class as an attribute `CTD.filename`;
* contain a method, `salt_mean()`, which returns the mean of the salinity;
* contain a method, `temp_max()`, which returns the maximum value of the temperature values.
* contain a method, `combine(alpha, beta)` that returns a linear combination of `temp` and `salt` based on the equation: `alpha * temp + beta * salt`

The structure for `CTD` is provided below — you  need to fill in the code to make the class work properly.

In [18]:
class CTD(object):
    """Holds some data."""
    # YOUR CODE HERE
      
    def __init__(self, filename):
        self.filename = filename
        f = open(filename)
        self.salt = []
        self.temp = []
        for line in f.readlines():
            if line[0] == ' ':
                self.temp.append(float(line.split()[2]))
                self.salt.append(float(line.split()[5]))
#         self.salt = np.asarray(salt)
#         self.temp = np.asarray(temp)
        self.salt = np.asarray(self.salt)
        self.temp = np.asarray(self.temp)
        
            


#     def salt():  
#         with open(filename, 'r') as f:
#             data = []
#             i=0
#             lines=f.readlines()
#             for line in lines:
#                 values = line.split()
#                 i+=1
#                 if values[0]=='*END*':
#                     idex=i

#             for line in lines[idex+1:]:
#                 values = [float(value) for value in line.split()]
#                 data.append(values)
#         return data[:,5]
    def salt_mean(self):  
        return self.salt.mean()
    
    def temp_max(self):
        return self.temp.max()

    
#     def temp_max(self):
#         data=loadtxt(filename,comments='*')
#         return np.max(data[:,2])
#     def combine(self, alpha, beta):
#         print(alpha*self.salt()+beta*self.temp())
#         return alpha*self.salt()+beta*self.temp()
    def combine(self, alpha, beta):
#         return float(alpha[np.newaxis])*self.salt()+float(beta[np.newaxis])*self.temp()
        return np.asarray(alpha*self.temp+beta*self.salt)

In [19]:
f=open('data/MCH/m1001a.cnv2', 'r')
# data=np.loadtxt('data/MCH/m1001a.cnv2', comments='*')
data = []
i=0
lines=f.readlines()
for line in lines:
    values = line.split()
    i+=1
    if values[0]=='*END*':
        idex=i
print(idex)
for line in lines[idex+1:]:
    values = [float(value) for value in line.split()]
    data.append(values)
np.shape(data)
# line=f.readlines()
# # np.shape(line)
# values=line[0].split()
# type(values[0])
# values[0]
# line[0]
# values = [float(value) for value in line[25].split()]
# type(values[1])
# values

57


(16, 17)

In [15]:
"""(5 points) Test code for the previous function. This cell should NOT give any errors when it is run.
Do not alter the contents of this cell."""

## Test an instance of the class ##
filename1 = 'data/MCH/m1001a.cnv2'
# set up a CTD object for this Filename
ctd1 = CTD(filename1)
# test name
assert 'm1001a' in ctd1.filename
# make sure salt is there
assert ctd1.salt.any()
# make sure temp is there
assert ctd1.temp.any()
# test mean
assert np.allclose(ctd1.salt_mean(), 24.658829411764707)
# test max
assert np.allclose(ctd1.temp_max(), 22.120100000000001)
#test combine
assert np.allclose(ctd1.combine(0, 1).mean(), 24.658829411764707)
assert np.allclose(ctd1.combine(1, 1).mean(), 46.472258823529415)

## Test an instance of the class ##


filename2 = 'data/MCH/m1008b.cnv2'
# set up a CTD object for this Filename
ctd2 = CTD(filename2)
# test name
assert 'm1008b' in ctd2.filename
# make sure salt is there
assert ctd2.salt.any()
# make sure temp is there
assert ctd2.temp.any()
# test mean
assert np.allclose(ctd2.salt_mean(), 34.256788235294117)
# test max
assert np.allclose(ctd2.temp_max(), 21.5411)
assert np.allclose(ctd2.combine(0, 1).mean(), 34.256788235294117)
assert np.allclose(ctd2.combine(1, 1).mean(), 55.666350000000008)

print("Success!")

Success!


### 2 Fiting a line

Use `numpy` polynomial functions to fit random given values `x` and `y` to a line (i.e., first order polynomial), and return the expected value based on that fit at `x=5.0`


In [20]:
def xatfive(x, y):
    'Expected fit at x=5 of linear fit of random data x and y'
    # YOUR CODE HERE
    p = np.polyfit(x, y, 1)
    
    return p[0]*5+p[1]

In [21]:
x = np.array([ 0.29646582,  5.9083115 ,  2.97347063,  0.77284422,  9.39502588,
               2.15227687,  6.1158336 ,  4.56733438,  9.9835841 ,  9.72066327])
y = np.array([  1.18875319,  17.82393043,   9.1241461 ,   2.5398729 ,
               28.48371414,   6.5234553 ,  18.49199616,  13.91623748,
               29.9688738 ,  29.41331221])

assert np.allclose(xatfive(x, y), 15.182532735557906)
print("Success!")

Success!


### 3 Everybody's shuffling

Based on Jake VanderPlas' [wonderful talk](https://speakerdeck.com/jakevdp/statistics-for-hackers), we would like to use a method of bootstrapping to see if two distributions are different. This is the approach, in psudocode:
    
    Take two samples, x and y, of a thing
    calculate and store the difference between the means of x and y
    
    combine x and y in the same pool
    set up a place to store the values we will calculate
    for many, many times:    # optional argument N times, specifically
        shuffle the pool
        pick new sets of xnew and ynew the same size as the originals (no duplicates)
        calculate and store the difference between the means of xnew and ynew

    calculate how many times the difference between the means of x and y 
    is greater than the difference between the means of xnew and ynew.
    
    Estimate the probability, p, based on the number of times greater 
    and the total number of times.
    
    Return the minimum of p and 1-p. (This is similar to a p-value in statistics)
       

In [22]:
def meandiff_prob_bootstrap(x, y, N=10000):
    # YOUR CODE HERE
    xydiff=np.mean(x)-np.mean(y)
    z=np.append(x,y)

    xyndiff = []

    for nn in range(N):
        np.random.shuffle(z)
        xnew=z[:len(x)]
        ynew=z[len(x):]
        xyndiff.append(np.mean(xnew)-np.mean(ynew))

    difftime  = np.sum(xyndiff < xydiff)
    p = difftime / N

    return min(p,1-p)

In [24]:
x = np.array([84, 72, 57, 46, 63, 76, 99, 91])
y = np.array([81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69])

assert np.allclose( meandiff_prob_bootstrap(x, y), 0.165, atol=0.03)
print('Success!')

Success!
