In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as nr
from sklearn.model_selection import GridSearchCV
from sentinels import NOTHING
from sklearn.model_selection import train_test_split

def train(df, features, nFeatures, targetedClass, classifier):
    """Fit ``classifier`` on the selected feature columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features : list of str, or a single column label
        Columns used as model inputs.
    nFeatures : int
        Number of feature columns. The input matrix is reshaped to
        ``(len(df), nFeatures)``, so a mismatch raises ``ValueError``.
    targetedClass : str
        Name of the column holding the class labels.
    classifier : object
        Estimator exposing ``fit(X, Y)``.

    Returns
    -------
    object
        Whatever ``classifier.fit(X, Y)`` returns.
    """
    nRows = df.shape[0]

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement. The reshape also turns a single selected column
    # (1-D) into the 2-D matrix that estimators expect.
    X = df[features].to_numpy().reshape(nRows, nFeatures)
    Y = df[targetedClass].to_numpy().ravel()

    return classifier.fit(X, Y)

def test(df, features, nFeatures, targetedClass, trainedClassifier):
    
    nRows = df.shape[0]
    
    X = df[features].as_matrix().reshape(nRows, nFeatures)
    
    df.predicted = trainedClassifier.predict(X)
    
    Y = df[targetedClass].as_matrix().ravel()
    print('Score: ', trainedClassifier.score(X, Y))
    
    return df
    

def treeClassification(df, targetedClass, maxDepthParameter = None, features = None):
    """Build an unfitted decision tree, grid-searching ``max_depth`` if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; only read when a grid search is required.
    targetedClass : str
        Name of the column holding the class labels.
    maxDepthParameter : int or None
        Tree depth to use. When ``None``, depths 1..29 are tried with
        ``GridSearchCV`` and the best one is printed and used.
    features : list of str or None
        Columns used as inputs for the grid search. When ``None``, every
        column except ``targetedClass`` is used. Pass the list explicitly if
        ``df`` also carries non-feature columns such as a row id.

    Returns
    -------
    sklearn.tree.DecisionTreeClassifier
        A new, unfitted classifier configured with the chosen ``max_depth``.
    """
    from sklearn import tree

    if maxDepthParameter is None:
        # The label column must never be part of X: searching on the full
        # frame leaks the target into the features and makes the
        # cross-validated scores meaningless.
        if features is None:
            X = df.drop(columns = [targetedClass])
        else:
            X = df[features]

        algorithmParameters = {'max_depth': list(range(1, 30))}

        grid = GridSearchCV(tree.DecisionTreeClassifier(), algorithmParameters)
        grid.fit(X, df[targetedClass])

        maxDepthParameter = grid.best_estimator_.max_depth
        print('maxDepthParameter: ', maxDepthParameter)

    return tree.DecisionTreeClassifier(max_depth = maxDepthParameter)


def logisticRegression(df, targetedClass, cParameter = None, features = None):
    """Build an unfitted logistic regression, grid-searching ``C`` if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; only read when a grid search is required.
    targetedClass : str
        Name of the column holding the class labels.
    cParameter : float or None
        Value for ``C``. When ``None``, the values 0.1, 0.2, ... 9.9 are
        tried with ``GridSearchCV`` and the best one is printed and used.
    features : list of str or None
        Columns used as inputs for the grid search. When ``None``, every
        column except ``targetedClass`` is used. Pass the list explicitly if
        ``df`` also carries non-feature columns such as a row id.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        A new, unfitted classifier configured with the chosen ``C``.
    """
    from sklearn import linear_model

    if cParameter is None:
        # Keep the label column out of X; fitting the search on the whole
        # frame leaks the target into the features.
        if features is None:
            X = df.drop(columns = [targetedClass])
        else:
            X = df[features]

        algorithmParameters = {'C': [i * 0.1 for i in range(1, 100)]}

        grid = GridSearchCV(linear_model.LogisticRegression(), algorithmParameters)
        grid.fit(X, df[targetedClass])

        cParameter = grid.best_estimator_.C
        print('cParameter: ', cParameter)

    return linear_model.LogisticRegression(C = cParameter)


def SVMClassification(df, targetedClass, cParameter = None, kernelParameter = None, features = None):
    """Build an unfitted SVC, grid-searching whichever of C / kernel is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; only read when a grid search is required.
    targetedClass : str
        Name of the column holding the class labels.
    cParameter : float or None
        Value for ``C``. ``None`` means "search for it".
    kernelParameter : str or None
        Kernel name. ``None`` means "search over linear, poly and rbf".
    features : list of str or None
        Columns used as inputs for the grid search. When ``None``, every
        column except ``targetedClass`` is used. Pass the list explicitly if
        ``df`` also carries non-feature columns such as a row id.

    Returns
    -------
    sklearn.svm.SVC
        A new, unfitted classifier configured with the chosen ``C`` and
        ``kernel``.
    """
    from sklearn import svm

    searchC = cParameter is None
    searchKernel = kernelParameter is None

    if searchC or searchKernel:
        kernelValues = ['linear', 'poly', 'rbf']

        if searchC and searchKernel:
            # Joint search: coarse geometric C grid crossed with every kernel.
            algorithmParameters = {'C': [0.01*3**i for i in range(1, 10)],
                                   'kernel': kernelValues}
            model = svm.SVC()
        elif searchC:
            # Kernel is fixed, so a finer geometric C grid is affordable.
            algorithmParameters = {'C': [0.001 * 1.5**i for i in range(1, 40)]}
            model = svm.SVC(kernel = kernelParameter)
        else:
            algorithmParameters = {'kernel': kernelValues}
            model = svm.SVC(C = cParameter)

        # Keep the label column out of X; fitting the search on the whole
        # frame leaks the target into the features.
        if features is None:
            X = df.drop(columns = [targetedClass])
        else:
            X = df[features]

        grid = GridSearchCV(model, algorithmParameters)
        # Always index by `targetedClass`; the C-only branch used to read the
        # hard-coded `df.target` and ignored the argument.
        grid.fit(X, df[targetedClass])

        best = grid.best_estimator_
        if searchC and searchKernel:
            print('C: ', best.C)
            print('Kernel: ', best.kernel)
            cParameter = best.C
            kernelParameter = best.kernel
        elif searchC:
            print('cParameter: ', best.C)
            cParameter = best.C
        else:
            print('kernelParameter: ', best.kernel)
            kernelParameter = best.kernel

    return svm.SVC(C = cParameter, kernel = kernelParameter)




def forestClassification(df, targetedClass, nEstimatorsParameter = None, maxDepthParametar = NOTHING, maxLeafNodesParameter = NOTHING, minSamplesLeafParametar  = None, features = None):
    """Build an unfitted random forest, grid-searching unspecified settings.

    ``None`` is a meaningful value for ``max_depth`` and ``max_leaf_nodes``,
    so those two parameters use the ``NOTHING`` sentinel to mean "search for
    it"; the other two use ``None`` for that purpose.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; only read when a grid search is required.
    targetedClass : str
        Name of the column holding the class labels.
    nEstimatorsParameter : int or None
        ``n_estimators``; ``None`` searches 2, 4, ... 38.
    maxDepthParametar : int, None or NOTHING
        ``max_depth``; ``NOTHING`` searches 2, 4, ... 38.
    maxLeafNodesParameter : int, None or NOTHING
        ``max_leaf_nodes``; ``NOTHING`` searches 2, 4, ... 16384 and ``None``.
    minSamplesLeafParametar : int or None
        ``min_samples_leaf``; ``None`` searches 2, 4, ... 18.
    features : list of str or None
        Columns used as inputs for the grid search. When ``None``, every
        column except ``targetedClass`` is used. Pass the list explicitly if
        ``df`` also carries non-feature columns such as a row id.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        A new, unfitted classifier configured with the chosen settings.
    """
    from sklearn import ensemble

    # (printed label, estimator keyword, caller value, "search me" marker,
    #  candidate values). Listed in the order the results are printed.
    searchable = [
        ('nEstimatorsParameter', 'n_estimators', nEstimatorsParameter, None,
         [2 * i for i in range(1, 20)]),
        ('maxLeafNodesParameter', 'max_leaf_nodes', maxLeafNodesParameter, NOTHING,
         [2 ** i for i in range(1, 15)] + [None]),
        # Original author's note on this range: restore the upper bound to 30.
        ('maxDepthParametar', 'max_depth', maxDepthParametar, NOTHING,
         [2 * i for i in range(1, 20)]),
        ('minSamplesLeafParametar', 'min_samples_leaf', minSamplesLeafParametar, None,
         [2 * i for i in range(1, 10)]),
    ]

    chosen = {}               # settings fixed by the caller or found by search
    algorithmParameters = {}  # settings that still have to be searched
    for _, keyword, value, marker, candidates in searchable:
        if value is marker:
            algorithmParameters[keyword] = candidates
        else:
            chosen[keyword] = value

    # Only pay for cross-validation when something is actually left to tune.
    # Running GridSearchCV with an empty grid would just refit the forest
    # several times and throw the result away.
    if algorithmParameters:
        # Keep the label column out of X; fitting the search on the whole
        # frame leaks the target into the features.
        if features is None:
            X = df.drop(columns = [targetedClass])
        else:
            X = df[features]

        # Searched settings are overridden by the grid, so the base model
        # only needs the caller-fixed ones.
        grid = GridSearchCV(ensemble.RandomForestClassifier(**chosen), algorithmParameters)
        grid.fit(X, df[targetedClass])

        for label, keyword, value, marker, _ in searchable:
            if value is marker:
                best = getattr(grid.best_estimator_, keyword)
                print(label + ': ', best)
                chosen[keyword] = best

    return ensemble.RandomForestClassifier(**chosen)

def intToColor(i):
    """Return the matplotlib color name for class index ``i``.

    Indices 0 through 8 map to a fixed palette; any other value yields
    ``None``.
    """
    palette = ('blue', 'green', 'red', 'cyan', 'magenta',
               'yellow', 'black', 'darkviolet', 'pink')

    # Compare with == (rather than indexing) so any value that merely equals
    # one of the indices resolves to the same color.
    for index, colorName in enumerate(palette):
        if i == index:
            return colorName

    return None


def plotClassification(df, targetedClass, feature, nClasses, xFeature = 'feat_1', yFeature = 'feat_13'):
    """Print per-class precision and scatter-plot right vs. wrong predictions.

    For every class index ``i`` in ``range(nClasses)`` the rows predicted as
    ``i`` are split into correct and incorrect ones. The share of correct rows
    is printed, then both groups are drawn on one scatter plot: circles for
    correct predictions, crosses for incorrect ones, colored by ``intToColor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``predicted`` column and the true-label column.
    targetedClass : str
        Name of the column holding the true class labels.
    feature : str
        Column whose non-null count is used to size each group.
    nClasses : int
        Number of class indices to report.
    xFeature, yFeature : str
        Columns drawn on the x and y axes. The defaults match the columns
        that were previously hard-coded.
    """
    true = []
    false = []

    for i in range(nClasses):
        predictedAsI = df.predicted == i
        true.append(df[predictedAsI & (df[targetedClass] == i)])
        false.append(df[predictedAsI & (df[targetedClass] != i)])

    for i in range(nClasses):
        nTrue = true[i][feature].count()
        nFalse = false[i][feature].count()
        total = nTrue + nFalse
        # A class that was never predicted has no defined precision; report
        # nan explicitly instead of dividing zero by zero.
        print(nTrue / total if total else float('nan'))

    fig = plt.figure(figsize = (8, 8))
    ax = fig.gca()

    for i in range(nClasses):
        color = intToColor(i)
        # pandas cannot scatter-plot an empty frame, so skip empty groups.
        if true[i][feature].count() != 0:
            true[i].plot(kind = 'scatter', x = xFeature, y = yFeature, ax = ax,
                         color = color, marker = 'o', alpha = 0.2)
        if false[i][feature].count() != 0:
            false[i].plot(kind = 'scatter', x = xFeature, y = yFeature, ax = ax,
                          color = color, marker = 'x', alpha = 0.2)

    plt.show()


In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): hard-coded absolute Windows path -- this cell only runs on a
# machine that has the file at exactly this location.
df = pd.read_csv('C:\\Users\\viktor\\Downloads\\ottoTest\\train.csv')
# Show the first rows as plain text.
print(df.head())

   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0   1       1       0       0       0       0       0       0       0       0   
1   2       0       0       0       0       0       0       0       1       0   
2   3       0       0       0       0       0       0       0       1       0   
3   4       1       0       0       1       6       1       5       0       0   
4   5       0       0       0       0       0       0       0       0       0   

    ...     feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \
0   ...           1        0        0        0        0        0        0   
1   ...           0        0        0        0        0        0        0   
2   ...           0        0        0        0        0        0        0   
3   ...           0        1        2        0        0        0        0   
4   ...           1        0        0        0        0        1        0   

   feat_92  feat_93   target  
0        0        0

In [3]:
# Replace the class labels with integer category codes.
df['target'] = df['target'].astype('category').cat.codes

# Feature columns are named feat_1 ... feat_93.
nFeatures = 93
features = ['feat_' + str(i) for i in range(1, nFeatures + 1)]

# Fix the seed so the split, and every score computed from it, is reproducible
# across kernel restarts. NOTE: train_size = 0.2 keeps only 20% of the rows
# for training; the remaining 80% are used for testing.
dfTrain, dfTest = train_test_split(df, train_size = 0.2, random_state = 42)

In [4]:
# Logistic regression with a fixed C.
# Original note: the score stays around 0.74-0.75 regardless of C.
classifier = logisticRegression(dfTrain, 'target', cParameter = 8.5)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

# Optional visual check of the predictions (disabled):
# plotClassification(dfTest, 'target', 'feat_1', 9)

Score:  0.74920711876


In [5]:
# SVM with the rbf kernel and the C noted as best for the default kernel.
classifier = SVMClassification(dfTrain, 'target',
                               cParameter = 2.2,
                               kernelParameter = "rbf")

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.75429771933


In [6]:
# SVM baseline: default values for both kernel and C.
classifier = SVMClassification(dfTrain, 'target',
                               cParameter = 1,
                               kernelParameter = "rbf")

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.742904470436


In [7]:
# SVM with the linear kernel and the C noted as best for that kernel.
classifier = SVMClassification(dfTrain, 'target',
                               cParameter = 0.0113,
                               kernelParameter = "linear")

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.752055430984


In [8]:
# Single decision tree with a fixed maximum depth.
classifier = treeClassification(dfTrain, 'target', maxDepthParameter = 11)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.634749409127


In [9]:
# Random forest baseline: all four hyperparameters at the values the author
# notes as defaults.
classifier = forestClassification(
    dfTrain, 'target',
    nEstimatorsParameter = 10,
    maxDepthParametar = None,
    maxLeafNodesParameter = None,
    minSamplesLeafParametar = 1,
)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

# Optional visual check of the predictions (disabled):
# plotClassification(dfTest, 'target', 'feat_1', 9)

Score:  0.741914631436


In [10]:
# Random forest with one hyperparameter (n_estimators) left at the baseline
# value; the other three are set explicitly.
classifier = forestClassification(
    dfTrain, 'target',
    nEstimatorsParameter = 10,
    maxLeafNodesParameter = 16384,
    maxDepthParametar = 26,
    minSamplesLeafParametar = 2,
)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.744419530129


In [11]:
# Random forest with two hyperparameters (max_depth, max_leaf_nodes) left at
# the baseline values.
classifier = forestClassification(
    dfTrain, 'target',
    nEstimatorsParameter = 40,
    maxDepthParametar = None,
    maxLeafNodesParameter = None,
    minSamplesLeafParametar = 2,
)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.765145546735


In [12]:
# Random forest with three hyperparameters (max_depth, max_leaf_nodes,
# min_samples_leaf) left at the baseline values; only n_estimators is changed.
classifier = forestClassification(
    dfTrain, 'target',
    nEstimatorsParameter = 34,
    maxDepthParametar = None,
    maxLeafNodesParameter = None,
    minSamplesLeafParametar = 1,
)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.766781811203


In [13]:
# Random forest with the hyperparameters the author labels as ideal.
classifier = forestClassification(
    dfTrain, 'target',
    nEstimatorsParameter = 40,
    maxLeafNodesParameter = None,
    maxDepthParametar = 38,
    minSamplesLeafParametar = 2,
)

trainedClassifier = train(df = dfTrain, features = features, nFeatures = nFeatures,
                          targetedClass = 'target', classifier = classifier)

dfTest = test(df = dfTest, features = features, nFeatures = nFeatures,
              targetedClass = 'target', trainedClassifier = trainedClassifier)

Score:  0.763468880674
