In [1]:
import numpy as np
import re
import math 
import scipy.stats as stats
from scipy.optimize import minimize

In [2]:
def get_data(datafile):
    """Parse a whitespace-delimited expression table (copied from Moriarty and tweaked).

    Layout consumed by this parser:
      line 1 : one time-point label per experiment, e.g. ``4hr``
      line 2 : a leading label followed by one ``+-SD`` token per experiment
      line 3 : a separator line, ignored
      rest   : one row per gene: gene name followed by the observed values

    Blank lines in the data section are skipped.

    Parameters
    ----------
    datafile : path of the table to read.

    Returns
    -------
    X      : np.ndarray, time points (hrs) of the N experiments (columns)
    S_true : np.ndarray of float, sigma of each of the N experiments
    Y      : list with one np.ndarray per gene; Y[i][t] is the observed tpm
             for gene i at column t

    Raises
    ------
    ValueError : if a header token does not have the expected form, or the
                 number of sigmas differs from the number of time points.
    """
    with open(datafile) as f:
        # First header line gives us the time points.
        X = []
        for s in f.readline().split():
            match = re.search(r'^(\d+)hr', s)
            if match is None:
                raise ValueError(
                    "unrecognized time-point label {!r} in {}".format(s, datafile))
            X.append(int(match.group(1)))
        X = np.array(X)
        N = len(X)

        # Second header line gives us a leading label followed by the +-SD's.
        sd_fields = f.readline().split()[1:]
        if len(sd_fields) != N:
            raise ValueError(
                "expected {} sigma fields but found {} in {}".format(
                    N, len(sd_fields), datafile))
        S_true = np.zeros(N)
        for i, s in enumerate(sd_fields):
            # Accept an optional fractional part so "+-2.5" is not truncated to 2.
            match = re.search(r'^\+-(\d+(?:\.\d+)?)', s)
            if match is None:
                raise ValueError(
                    "unrecognized sigma field {!r} in {}".format(s, datafile))
            S_true[i] = float(match.group(1))

        # Third header line is just ------ stuff.
        f.readline()

        # Remaining lines are data: gene name, then one value per experiment.
        Y = []
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            Y.append(np.array([float(s) for s in fields[1:]]))

    return X, S_true, Y

def calc_mean_for_normal(a, b, phi, t):
    """Mean of the Gaussian under the sinusoidal model from MCB112 PSET 7.

    Computes ``b + a * sin(2*pi*(t + phi)/24)``.

    Parameters
    ----------
    a   : coefficient multiplying the sine term
    b   : constant added to the sine term
    phi : offset added to ``t`` before the sine is taken
    t   : time point; a scalar or an array-like of time points

    Returns
    -------
    The model mean: a float-like scalar for scalar ``t``, or an np.ndarray
    with one mean per entry when ``t`` is array-like.
    """
    # np.sin (rather than math.sin) lets t be a whole vector of time points.
    rads = 2 * np.pi * (1 / 24) * (np.asarray(t) + phi)
    return b + a * np.sin(rads)
    

def calc_nll(params, data_vec, sigma_vec, time_vec):
    """Negative log likelihood of the data under the sinusoidal Gaussian model.

    Parameters
    ----------
    params    : sequence whose first three entries are [a, b, phi]
    data_vec  : observed values, one per data point
    sigma_vec : standard deviation of each data point
    time_vec  : time of each data point

    Returns
    -------
    The negative of the summed normal log densities of ``data_vec``, with
    means given by ``calc_mean_for_normal`` and scales from ``sigma_vec``.

    Raises
    ------
    ValueError : if the three vectors do not have the same length.
    """
    a, b, phi = params[0], params[1], params[2]

    # Indexing by position would silently ignore extra sigma/time entries,
    # so insist that the three vectors line up.
    if not (len(data_vec) == len(sigma_vec) == len(time_vec)):
        raise ValueError(
            "data_vec, sigma_vec and time_vec must have equal lengths "
            "(got {}, {}, {})".format(len(data_vec), len(sigma_vec), len(time_vec)))

    # Build the per-point means one at a time so this works whether or not
    # calc_mean_for_normal accepts arrays.
    means = np.array(
        [calc_mean_for_normal(a, b, phi, t) for t in time_vec], dtype=float)

    # One vectorized logpdf call instead of one scipy call per data point.
    log_probs = stats.norm.logpdf(
        np.asarray(data_vec, dtype=float),
        means,
        np.asarray(sigma_vec, dtype=float),
    )

    return -np.sum(log_probs)
        

In [3]:
# Load the table; get_data returns (time points, sigmas, per-gene observations).
data_list = get_data("w07-data.tbl")

In [7]:
# Split the loaded tuple into time points, sigmas and per-gene observations.
time_arr, sigma_arr, data_arr = data_list

# Starting guess for the model parameters, in the order [a, b, phi].
a0, b0, phi0 = 1, 1, 1
start_params = np.array([a0, b0, phi0])

# Negative log likelihood of the first gene at the starting guess.
test = calc_nll([a0, b0, phi0], data_arr[0], sigma_arr, time_arr)

# Fit the first gene by minimizing the negative log likelihood from that guess.
test_2 = minimize(
    calc_nll,
    start_params,
    args=(data_arr[0], sigma_arr, time_arr),
)

In [8]:
# Display the optimizer result for the first gene's fit.
test_2

      fun: 29.60821751108932
 hess_inv: array([[ 5.95189068, -2.92619351,  0.11507657],
       [-2.92619351,  4.66253345, -0.55354949],
       [ 0.11507657, -0.55354949,  0.154222  ]])
      jac: array([-9.53674316e-07, -2.38418579e-06, -3.81469727e-06])
  message: 'Optimization terminated successfully.'
     nfev: 85
      nit: 13
     njev: 17
   status: 0
  success: True
        x: array([24.30624777, 41.08230173,  0.32553572])

In [9]:
# Re-runs the minimization with the same objective, start point and data as test_2.
# NOTE(review): presumably a repeatability check — confirm, or drop the duplicate call.
test_3 = minimize(calc_nll, start_params, (data_arr[0], sigma_arr, time_arr))

In [10]:
# Display the optimizer result of the repeated fit.
test_3

      fun: 29.60821751108932
 hess_inv: array([[ 5.95189068, -2.92619351,  0.11507657],
       [-2.92619351,  4.66253345, -0.55354949],
       [ 0.11507657, -0.55354949,  0.154222  ]])
      jac: array([-9.53674316e-07, -2.38418579e-06, -3.81469727e-06])
  message: 'Optimization terminated successfully.'
     nfev: 85
      nit: 13
     njev: 17
   status: 0
  success: True
        x: array([24.30624777, 41.08230173,  0.32553572])