In [1]:
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('../../data/final/final1.csv', sep=';', encoding='utf8')

In [2]:
newcols = pd.get_dummies(data['NAME'])
names = newcols.columns.values
data[names] = newcols
data = data.drop('NAME', axis=1)

In [3]:
from sklearn.ensemble import RandomForestRegressor

X = data.drop('CALLS', axis=1).values
y = data['CALLS'].values

model = RandomForestRegressor(n_estimators=10, criterion='mse')
model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [6]:
# output line format
from math import ceil
from datetime import datetime
from workalendar.europe import France
import numpy as np
import io

# features

features = ['YEAR', 'MONTH', 'DAY',
            'WEEKDAY', 'WEEKEND', 'HOLIDAY',
            'TIMESLOT', 'DAYSLOT',
            'NAME', 'CALLS']

# French calendar object
calendar = France()

# first and last dates possibly considered
first = datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
last = datetime.strptime('2013-12-31 23:30:00', '%Y-%m-%d %H:%M:%S')

output_line = u"{datetime}\t{ass_assignment}\t{calls}\r\n"

conv = dict(zip(names, range(26)))

def dummy(name):
    x = np.zeros(26)
    x[conv[name]] = 1
    return x

def parse_date(datestring):
    """ Parse date string and return relevant features. """

    # datetime object encoding data information
    date = datetime.strptime(datestring[:-4], '%Y-%m-%d %H:%M:%S')

    # create date-related predictor

    weekday = date.weekday()
    isweekend = int(weekday >= 5)
    isholiday = int(calendar.is_working_day(date.date()))

    timeslot = int((date - first).total_seconds() / 1800)  # number of half-hours since 01/01/2011
    dayslot = 2 * date.hour + date.minute / 30  # number of half-hours since midnight

    return (date.year, date.month, date.day,
            weekday, isweekend, isholiday, timeslot, dayslot)


def predict(datestring, ass_assignment):
    """ Final prediction method being called.
        Takes the datetime and ass_assignment and predict the number of calls.
    """
    return model.predict(np.hstack([parse_date(datestring), dummy(ass_assignment)]).reshape(1,-1))


def submit(inputfile, outputfile):
    """ Read submission file and make prediction, saving output to new file. """

    with io.open(inputfile, 'r') as input, io.open(outputfile, 'w') as output:
        # copy heading to file
        output.write(input.readline())

        # read each line, predict, and output
        for line in input:
            datetime, ass_assignment, _ = line.split('\t')

            calls = predict(datetime, ass_assignment)[0]

            output.write(output_line.format(datetime=datetime,
                                            ass_assignment=ass_assignment,
                                            calls=calls))

inputfile = '../outputs/submission.txt'
outputfile = '../outputs/output_rf.txt'

submit(inputfile, outputfile)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from time import time

names = data['NAME'].drop_duplicates().sort_values()

models = {}

for name in names:
    t0 = time()
    test = data[data['NAME'] == name].drop('NAME', axis=1)
    X,y = test.drop('CALLS', axis=1).values, test['CALLS'].values
    
    model = RandomForestRegressor(n_estimators=10, criterion='mse')
    model.fit(X,y)
    models[name] = model
    print name, model.score(X,y), time()-t0
print 'Total time:', time()-t0

CAT 0.988925529032 0.46425485611
CMS 0.78171106441 0.224555015564
Crises 0.687733085134 0.127690076828
Domicile 0.983705150388 1.15469384193
Evenements 0.971463524036 0.0434510707855
Gestion 0.753425805423 0.244548082352
Gestion - Accueil Telephonique 0.980479227508 0.651282072067
Gestion Amex 0.866320869304 0.171950101852
Gestion Assurances 0.925886582358 0.436162948608
Gestion Clients 0.842953166586 0.429405927658
Gestion DZ 0.791426868689 0.338741064072
Gestion Relation Clienteles 0.860497032569 0.406984090805
Gestion Renault 0.841323549705 0.133215904236
Japon 0.855591108499 1.06210303307
Manager 0.79335671756 0.610345125198
Mécanicien 0.86531965081 0.470265865326
Médical 0.972354127647 1.16708803177
Nuit 0.968738970322 1.14451408386
Prestataires 0.791765086767 0.184625148773
RENAULT 0.96194072985 1.04368305206
RTC 0.949794239669 0.383384943008
Regulation Medicale 0.83825978076 1.02713108063
SAP 0.941202663543 0.731927871704
Services 0.980008233353 1.15965795517
Tech. Axa 0.9917596

In [4]:
# output line format
from math import ceil
from datetime import datetime
from workalendar.europe import France
import numpy as np

# features

features = ['YEAR', 'MONTH', 'DAY',
            'WEEKDAY', 'WEEKEND', 'HOLIDAY',
            'TIMESLOT', 'DAYSLOT',
            'NAME', 'CALLS']

# French calendar object
calendar = France()

# first and last dates possibly considered
first = datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
last = datetime.strptime('2013-12-31 23:30:00', '%Y-%m-%d %H:%M:%S')

output_line = "{datetime}\t{ass_assignment}\t{calls}\r\n"

def parse_date(datestring):
    """ Parse date string and return relevant features. """

    # datetime object encoding data information
    date = datetime.strptime(datestring[:-4], '%Y-%m-%d %H:%M:%S')

    # create date-related predictor

    weekday = date.weekday()
    isweekend = int(weekday >= 5)
    isholiday = int(calendar.is_working_day(date.date()))

    timeslot = int((date - first).total_seconds() / 1800)  # number of half-hours since 01/01/2011
    dayslot = 2 * date.hour + date.minute / 30  # number of half-hours since midnight

    return (date.year, date.month, date.day,
            weekday, isweekend, isholiday, timeslot, dayslot)


def predict(datestring, ass_assignment):
    """ Final prediction method being called.
        Takes the datetime and ass_assignment and predict the number of calls.
    """
    return ceil(models[ass_assignment].predict(np.array(parse_date(datestring)).reshape(1,-1)))


def submit(inputfile, outputfile):
    """ Read submission file and make prediction, saving output to new file. """

    with open(inputfile, 'r') as input, open(outputfile, 'w') as output:
        # copy heading to file
        output.write(input.readline())

        # read each line, predict, and output
        for line in input:
            datetime, ass_assignment, _ = line.split('\t')

            calls = predict(datetime, ass_assignment)

            output.write(output_line.format(datetime=datetime,
                                            ass_assignment=ass_assignment,
                                            calls=calls))

inputfile = './submission.txt'
outputfile = './output_rf.txt'

submit(inputfile, outputfile)

In [22]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

labels = LabelEncoder()
labels.fit(np.arange(7))

LabelEncoder()

In [23]:
labels.transform(6)

6

In [29]:
numerical_encoder(1,7,drop0=True)

[1, 0, 0, 0, 0, 0]

In [24]:
def numerical_encoder(val, n, drop0=False):
    
    ret = [0]*n
    ret[val] = 1
    if drop0:
        del ret[0]
    return ret
        
    

def parse_date(datestring):
    
    date = datetime.strptime(datestring[:-4], '%Y-%m-%d %H:%M:%S')

    # create date-related predictor
    year = date.year
    month = numerical_encoder(date.month-1, 12, drop0=True)
    day = numerical_encoder(date.day-1, 12, drop0=True)
    
    weekday = numerical_encoder(date.weekday(), 7, drop0=True)
    isweekend = int(weekday >= 5)
    isholiday = int(calendar.is_working_day(date.date()))

    timeslot = int((date - first).total_seconds() / 1800)  # number of half-hours since 01/01/2011 00:00
    dayslot = 2*date.hour + date.minute/30  # number of half-hours since midnight
    dayslot = numerical_encoder(dayslot, 48, drop0=True)
    
    return [timeslot, date.year, date.month, date.day,
            dayslot, isweekend, isholiday] + weekday

date_encoder('2011-01-01 00:00:00.000')

NameError: name 'date_encoder' is not defined