In [2]:
## LABEL DATA

import pandas as pd
import numpy as np

labels = pd.read_csv('dataset/survey/PHQ-9.csv')

# there are 38 post responses and 46 pre responses
labelsproc = np.empty(38, dtype=np.int64)


# takes THE PHQ-9.csv and a zeroed array, populates zeroed array
# with depression scores
def surveyIntoDepressionScore(X, y):
    
    arr = np.ones((38,9))
    colLabels = X.columns.values.tolist()
    
    # converts every survey answer into scores
    # populates arr with scores
    for i in range(0,38):
        for j in range(2,11):
            arr[i,j-2] = textToScore(X[colLabels[j]][46+i])
        
    sumArr = np.sum(arr, axis=1)
        
    return sumArr;

# returns dictionary with uids as keys and PHQ9 scores as values
def surveyIntoDepressionScoreDict(X, y):
    
    arr = np.ones((38,9))
    colLabels = X.columns.values.tolist()
    
    # converts every survey answer into scores
    # populates arr with scores
    for i in range(0,38):
        for j in range(2,11):
            arr[i,j-2] = textToScore(X[colLabels[j]][46+i])
        
    sumArr = np.sum(arr, axis=1)
    
    labelDict = {}
    for i in range(0,38):
        labelDict[str(X['uid'][i+46])] = sumArr[i]
    
    return labelDict;
    

def textToScore(str):
    if str == 'Not at all':
        return 0;
    elif str == 'Several days':
        return 1;
    elif str == 'More than half the days':
        return 2;
    elif str == 'Nearly every day':
        return 3;
    

labelsproc.size
labelVector = surveyIntoDepressionScore(labels,labelsproc)
# print(labelVector)
# prints out label vector containing depression scores of 
# participants who replied to the post PHQ-9




## GPS DATA

import pandas as pd
import numpy as np
import math

# 1->'01' 25->'25'
def doubleDigitizer(x):
    if x < 10:
        return '0' + str(x);
    else:
        return str(x);

# to identify renegades
renegade_vectors = np.array([])

    
# here are 60 users from gps_u00 to gps_u59
# we populate df_collection with every gps data for every user
df_collection = {}
for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        df_collection[ddi] = pd.read_csv('dataset/sensing/gps/gps_u' + ddi + '.csv')
    except IOError:
        print "There aint no user # " + ddi + " for GPS"
        renegade_vectors = np.append(renegade_vectors,ddi)
        pass
        

        

def distBetweenTwoCoords(lat1,lon1,lat2,lon2):
    
    R = 6371 # Radius of Earth in km

    dLat = math.radians(float(lat2) - float(lat1))
    dLon = math.radians(float(lon2) - float(lon1))
    lat1 = math.radians(float(lat1))
    lat2 = math.radians(float(lat2))
    
    a = math.sin(dLat/2) * math.sin(dLat/2) + \
        math.cos(lat1) * math.cos(lat2) * math.sin(dLon/2) * math.sin(dLon/2)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

    d = R * c # if want miles multiply by 0.621371

    return d;

# lets test it out. internet says its 45 meters.
# distBetweenTwoCoords(43.7591346,-72.3292405, 43.759503,-72.32901795)



#assumes input is DataFrame, with labeled latitute and longitude columns

def totalDistCovered(X):
    
    sum = 0
    
    lats = X['latitude']
    lons = X['longitude']
    
    for i in range(0,(lats.size-1)):
        sum += distBetweenTwoCoords(lats[i],lons[i],lats[i+1],lons[i+1])
                
    return sum;

# print("Total distance covered in meters is: " + str(totalDistCovered(gps_u00)))

gpsFeatureVector = np.zeros((60,1))

for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        gpsFeatureVector[i,0] = totalDistCovered(df_collection[ddi])
    except KeyError:
        gpsFeatureVector[i,0] = np.NaN
    
# takes a couple seconds
# print(gpsFeatureVector)

## TEXTDATA

import pandas as pd
import numpy as np
import math

# 1->'01' 25->'25'
def doubleDigitizer(x):
    if x < 10:
        return '0' + str(x);
    else:
        return str(x);

# here are 60 users from gps_u00 to gps_u59
# we populate df_collection with every gps data for every user
df_collection = {}
for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        df_collection[ddi] = pd.read_csv('dataset/sms/sms_u' + ddi + '.csv')
    except IOError:
        pass
        print "There aint no user # " + ddi + " for SMS"
        
from datetime import datetime

def timeStampToDatetime(ts):
    return datetime.utcfromtimestamp(float(ts))

# outputs datetime.datetime(2013, 3, 24, 4, 31, 22)
# that's for year, month, day, hour, minute, second

# int1 is first time, int2 is last time, returns days elapsed in between
def daysInBetween(int1, int2):
    secs = int2 - int1
    return secs/86400;

# internet says 66
# daysInBetween(1364099009, 1369833786)

def dailySmsFreq(X):
    ts = X['timestamp']
    return (ts.size)/daysInBetween(ts[0], ts[ts.size-1]);

# dailySmsFreq(df_collection['00'])

smsFeatureVector = np.zeros((60,1))

for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        smsFeatureVector[i,0] = dailySmsFreq(df_collection[ddi])
    except KeyError:
        smsFeatureVector[i,0] = np.NaN
    
# takes a couple seconds
# print(smsFeatureVector)


## CALLLOG DATA

import pandas as pd
import numpy as np
import math

# 1->'01' 25->'25'
def doubleDigitizer(x):
    if x < 10:
        return '0' + str(x);
    else:
        return str(x);

# here are 60 users from gps_u00 to gps_u59
# we populate df_collection with every gps data for every user
df_collection = {}
for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        df_collection[ddi] = pd.read_csv('dataset/call_log/call_log_u' + ddi + '.csv')
    except IOError:
        pass
        print "There aint no user # " + ddi + " for CALL"
        
from datetime import datetime

def timeStampToDatetime(ts):
    return datetime.utcfromtimestamp(float(ts))

# outputs datetime.datetime(2013, 3, 24, 4, 31, 22)
# that's for year, month, day, hour, minute, second

# int1 is first time, int2 is last time, returns days elapsed in between
def daysInBetween(int1, int2):
    secs = int2 - int1
    return secs/86400;

def dailyCallFreq(X):
    ts = X['timestamp']
    return (ts.size)/daysInBetween(ts[0], ts[ts.size-1]);

# dailyCallFreq(df_collection['00'])

callFeatureVector = np.zeros((60,1))

for i in range(0,60):
    ddi = doubleDigitizer(i)
    try:
        callFeatureVector[i,0] = dailyCallFreq(df_collection[ddi])
    except KeyError:
        callFeatureVector[i,0] = np.NaN
    
# takes a couple seconds
# print(callFeatureVector)





a1 = gpsFeatureVector
a2 = smsFeatureVector
a3 = callFeatureVector


There aint no user # 06 for GPS
There aint no user # 11 for GPS
There aint no user # 21 for GPS
There aint no user # 26 for GPS
There aint no user # 28 for GPS
There aint no user # 29 for GPS
There aint no user # 37 for GPS
There aint no user # 38 for GPS
There aint no user # 40 for GPS
There aint no user # 48 for GPS
There aint no user # 55 for GPS
There aint no user # 06 for SMS
There aint no user # 11 for SMS
There aint no user # 21 for SMS
There aint no user # 26 for SMS
There aint no user # 28 for SMS
There aint no user # 29 for SMS
There aint no user # 37 for SMS
There aint no user # 38 for SMS
There aint no user # 40 for SMS
There aint no user # 48 for SMS
There aint no user # 55 for SMS
There aint no user # 06 for CALL
There aint no user # 11 for CALL
There aint no user # 21 for CALL
There aint no user # 26 for CALL
There aint no user # 28 for CALL
There aint no user # 29 for CALL
There aint no user # 37 for CALL
There aint no user # 38 for CALL
There aint no user # 40 for CALL

In [112]:
## MACHINE LEARNING

featureMatrix = np.hstack((a1,a2,a3))

## KNN

from sklearn.metrics import accuracy_score
import operator
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)

# elegant but overruled by practicality
# x = np.all(np.isfinite(featureMatrix), axis=1)
# arrind = np.where(x == False)
# print(featureMatrix[np.all(np.isfinite(featureMatrix), axis=1)])


#print(renegade_vectors)

# ugly hack for removing NULLs
renegade_vectors = np.array([6,8,11,12,13,21,22,25,26,28,29,37,38,39,40,41,46,48,50,54,55,57])

for i in range(0,renegade_vectors.size):
    featureMatrix = np.delete(featureMatrix, int(renegade_vectors[i])-i, 0)
    pass



cutoff = 16

# shuffles matrix
np.random.shuffle(featureMatrix)

train = featureMatrix[0:cutoff]
test = featureMatrix[cutoff:38]

trainl = labelVector[0:cutoff]
testl = labelVector[cutoff:38]

knn.fit(train, trainl)
ypred = knn.predict(test)


# knn.fit(test, testl)
# ypred = knn.predict(train)
# print(accuracy_score(trainl, ypred))


# LOGREG

# acc = 0
# for i in range(0,37-cutoff):
#     if testl[i] == knn.predict(test[i].reshape(1, -1)):
#         acc += 1
        
# result = float(acc/(38-cutoff))



# print(result)

print(accuracy_score(testl, ypred))

print(1-(float(abs(sum(map(operator.sub,testl,ypred))))/float(sum(testl))))

0.136363636364
0.394366197183


In [204]:
from sklearn import metrics


cutoff = 30

train = featureMatrix[0:cutoff]
test = featureMatrix[cutoff:38]

trainl = labelVector[0:cutoff]
testl = labelVector[cutoff:38]


# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(train, trainl)

# predict the response values for the observations in X
y_pred = logreg.predict(test)
print(metrics.accuracy_score(testl, y_pred))


## all data
y_pred2 = knn.predict

0.0


In [119]:
def averageItOut():
    avg1 = 0
    avg2 = 0
    cutoff = 16

    for i in range(0,100):
    # shuffles matrix
        np.random.shuffle(featureMatrix)

        train = featureMatrix[0:cutoff]
        test = featureMatrix[cutoff:38]

        trainl = labelVector[0:cutoff]
        testl = labelVector[cutoff:38]

        knn.fit(train, trainl)
        ypred = knn.predict(test)
        
        avg1 += accuracy_score(testl, ypred)

        avg2 += 1-(float(abs(sum(map(operator.sub,testl,ypred))))/float(sum(testl)))
    
    return avg1/100, avg2/100

averageItOut()

(0.070909090909090977, 0.48654929577464784)

In [118]:
a = 0
a += 0.13
a

0.13