# #1. Multi-Class Classification:

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn import datasets, linear_model
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Run every dataset(5 of them) through multiple models, with different parameters to find best model
best_estimators = {'knn_best': [],
                   'svm_best': [],
                   'linreg_best': []}
for i in range(1,7):
    # put data in pandas dataframe
    data = 'TrainData{}.txt'.format(i, i)
    label = 'TrainLabel{}.txt'.format(i, i)
    X = pd.read_csv(data, sep='\s+', header=None)
    y = pd.read_csv(label, header=None)
    print()
    print('''*** Dataset {} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    print('Number of Features: ' + str(X.shape[1]))
    print('Classes: ' + str(y[0].unique()))
    
    # fill missing values
    if X.isnull().any().any():
        # change to nan
        X = X[X < 1e99]
        # fill linear-ly
        X = X.interpolate()
        # fill outside values with mean
        X = X.fillna(X.mean())

    # split into 90% train and 10% test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=20)
    
    #Knn
    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': np.arange(5, 15)}
    knn_gs = GridSearchCV(knn, params_knn, cv=5)
    knn_gs.fit(X_train, y_train.values.flatten())
    
    #LinearRegression
    linreg = LinearRegression()
    linreg.fit(X_train, y_train.values.flatten())
    print(linreg.score(X_test, y_test))
    
    
    # SVM
    svm = LinearSVC()
    svm.fit(X_train, y_train.values.flatten())
    print(svm.score(X_test, y_test))
   
    
    # save best model
    best_estimators['knn_best'].append(knn_gs.best_estimator_)
    best_estimators['linreg_best'].append(linreg)
    best_estimators['svm_best'].append(svm)

    

    print()
    print(knn_gs.best_params_)
    print('knn: {}'.format(best_estimators['knn_best'][i-1].score(X_test, y_test)))
    
    print()
    print('linreg: {}'.format(best_estimators['linreg_best'][i-1].score(X_test, y_test)))
    
    print()
    print('svm: {}'.format(best_estimators['svm_best'][i-1].score(X_test, y_test)))
    


*** Dataset 1 ***
Number of Samples: 150
Number of Features: 3312
Classes: [1 2 4 3 5]




-0.5052099374787664
0.8666666666666667

{'n_neighbors': 5}
knn: 0.8666666666666667

linreg: -0.5052099374787664

svm: 0.8666666666666667

*** Dataset 2 ***
Number of Samples: 100
Number of Features: 9182
Classes: [ 1  2  3  4  5  6  7  8  9 10 11]




0.1338721701107315




0.8

{'n_neighbors': 5}
knn: 0.5

linreg: 0.1338721701107315

svm: 0.8

*** Dataset 3 ***
Number of Samples: 6300
Number of Features: 13
Classes: [9 1 8 6 2 4 7 5 3]
0.021817505826693617




0.2873015873015873

{'n_neighbors': 14}
knn: 0.33174603174603173

linreg: 0.021817505826693617

svm: 0.2873015873015873

*** Dataset 4 ***
Number of Samples: 2547
Number of Features: 112
Classes: [1 2 3 4 5 6 7 8 9]
0.7795576185903385




0.5215686274509804

{'n_neighbors': 5}
knn: 0.7333333333333333

linreg: 0.7795576185903385

svm: 0.5215686274509804

*** Dataset 5 ***
Number of Samples: 1119
Number of Features: 11
Classes: [5 6 7 4 8 3]




0.4847238516155669




0.5982142857142857

{'n_neighbors': 14}
knn: 0.4732142857142857

linreg: 0.4847238516155669

svm: 0.5982142857142857

*** Dataset 6 ***
Number of Samples: 612
Number of Features: 142
Classes: [  925000.  2250000.  8000000.  3500000.  1750000.  1500000.   950000.
   842500.  1250000.   800000.   600000.  1000000.   680000.   590000.
   650000.  5000000.  5250000.  1600000.   717500.   900000.  3750000.
   792500.  6000000.  5500000.   724500.   725000.  4000000.  4250000.
   742500.   750000.  4500000.  3000000.   630000.  2200000.  1300000.
   700000.  1200000.  2500000.  1050000.  4750000.   767500.  3100000.
  2350000.  6750000.  2600000.   667500.  2100000.  7000000.  3900000.
   625000.  2750000.  3400000.   575000.  5750000.   715000.  3250000.
   825000.  3575000.  5850000.   660000. 13800000.   727500.  7450000.
  5400000.   735000.   640000.   892500.  7250000. 10900000.   832500.
   692500.   705000.  1850000.   740000.  9000000.  2950000.   875000.
   675000.  1100000.  38500



0.4936204763073814
0.016129032258064516

{'n_neighbors': 14}
knn: 0.12903225806451613

linreg: 0.4936204763073814

svm: 0.016129032258064516




# Missing Value Estimation:

In [1]:
import pandas as pd
import numpy as np

In [2]:
def weightedKnnImpute(X, k):
    #Input: X:= pandas dataframe with uknown values set to NaN
    #       k:= number of nearest neighbors sought
    
    numberOfFeatures = X.shape[1]
    
    for index1, row1 in X.iterrows():
        #track progress
        print("row: " + str(index1))
        
        for i in range(numberOfFeatures):
            if np.isNaN(row1[i]):

                #calculate distances
                distArray = []
                for index2, row2 in X.iterrows():
                    distance = 0
                    if(index1 != index2):
                        for j in range(numberOfFeatures):
                            if( not np.isNaN(row1[j]) and not np.isNaN(row2[j])):
                                distance += np.square(X.loc[index1, j] - row2[j])

                        #invert distance
                        distArray.append(1 / np.sqrt(distance))
                    else:
                        distArray.append(0)

                #find nearest neighbors and weight their values 
                result = 0
                values = []
                totalWeight = 0
                weights = []
                knn = k
                
                for l in range(knn):
                    argmax = np.argmax(distArray)
                    if not np.isNaN(X.loc[argmax, i]):
                        values.append(X.loc[argmax, i])
                        totalWeight += distArray[argmax]
                        weights.append(distArray[argmax])
                        distArray[argmax] = 0
                    else:
                        distArray[argmax] = 0
                        knn += 1
   
                for m in range(len(values)):
                    result += values[m] * (weights[m] / totalWeight)

                row1[i] = result
                
    return X
    

In [3]:
for i in range(1,4):
    # put data in pandas dataframe
    data = 'MissingData{}.txt'.format(i)
    X = pd.read_csv(data, sep='\s+', header=None)
    
    X = X.transpose()
    
    # header
    print('\n''*** MissingData{} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    numberOfFeatures = X.shape[1]
    print('Number of Features: ' + str(X.shape[1]))
    print("\nOriginal Data: \n")

    # set uknowns in dataset to NaN
    X = X[X < 1.00000000000000e+99]
    print(str(X))
    
    Xresult = weightedKnnImpute(X, 4)
    
    Xresult = Xresult.transpose()
    
    np.savetxt("Q2_MissingResults{}.txt".format(i), Xresult.values, fmt='%f', delimiter="\t")
    


*** MissingData1 ***
Number of Samples: 14
Number of Features: 242

Original Data: 

     0     1     2     3     4     5     6     7     8     9    ...   232  \
0  -0.11 -0.30  0.50  0.00  0.40  0.39  0.50 -0.52 -0.87 -1.39  ...  0.19   
1   0.02 -0.37  0.18 -0.11 -0.16 -0.18 -0.24 -0.48 -0.17 -0.57  ... -0.27   
2  -0.36 -0.18  0.41   NaN  0.31 -0.09 -0.16 -0.73  0.10 -0.16  ... -0.43   
3  -0.11 -0.09   NaN  0.19 -0.34 -0.04   NaN -0.42 -0.12 -0.02  ... -0.12   
4   0.48 -0.16   NaN  0.00  0.02 -0.25  0.04 -0.40   NaN  0.30  ... -0.07   
5  -0.20 -0.16  0.15 -0.07  0.21 -0.69   NaN -0.01  0.52  0.41  ...  0.21   
6   0.27 -0.10 -0.25  0.18 -0.70 -0.64 -0.11 -0.34 -0.28  0.03  ... -0.01   
7   0.29 -0.09 -0.41  0.18  0.11 -0.38  0.10  0.55  0.50  0.50  ...  0.24   
8  -0.34  0.46 -0.07 -0.23  0.08  0.41 -0.04   NaN  0.09  0.48  ...  0.04   
9  -0.05  0.32 -0.13 -0.16  0.05  0.33 -0.01  0.37 -0.01  0.51  ...  0.01   
10  0.23  0.02 -0.15  0.03 -0.09 -0.62 -0.04 -0.29  0.08 -0.11  ...

row: 1
row: 2
row: 3
row: 4
row: 5
row: 6
row: 7
row: 8
row: 9
row: 10
row: 11
row: 12
row: 13
row: 14
row: 15
row: 16
row: 17
row: 18
row: 19
row: 20
row: 21
row: 22
row: 23
row: 24
row: 25
row: 26
row: 27
row: 28
row: 29
row: 30
row: 31
row: 32
row: 33
row: 34
row: 35
row: 36
row: 37
row: 38
row: 39
row: 40
row: 41
row: 42
row: 43
row: 44
row: 45
row: 46
row: 47
row: 48
row: 49

*** MissingData3 ***
Number of Samples: 79
Number of Features: 273

Original Data: 

          0          1          2          3          4          5    \
0   10.145677  11.000000  11.861707  11.999000  10.000000  11.999000   
1         NaN        NaN        NaN  10.000000        NaN        NaN   
2         NaN        NaN        NaN        NaN        NaN        NaN   
3    8.646739   8.000000        NaN  11.000000   9.000000   9.000000   
4    7.446256   7.000000   8.862947   6.405992   7.584963        NaN   
5         NaN  10.000000        NaN        NaN        NaN        NaN   
6         NaN   6.000000   



row: 1
row: 2
row: 3
row: 4
row: 5
row: 6
row: 7
row: 8
row: 9
row: 10
row: 11
row: 12
row: 13
row: 14
row: 15
row: 16
row: 17
row: 18
row: 19
row: 20
row: 21
row: 22
row: 23
row: 24
row: 25
row: 26
row: 27
row: 28
row: 29
row: 30
row: 31
row: 32
row: 33
row: 34
row: 35
row: 36
row: 37
row: 38
row: 39
row: 40
row: 41
row: 42
row: 43
row: 44
row: 45
row: 46
row: 47
row: 48
row: 49
row: 50
row: 51
row: 52
row: 53
row: 54
row: 55
row: 56
row: 57
row: 58
row: 59
row: 60
row: 61
row: 62
row: 63
row: 64
row: 65
row: 66
row: 67
row: 68
row: 69
row: 70
row: 71
row: 72
row: 73
row: 74
row: 75
row: 76
row: 77
row: 78


In [None]:
for i in range(1,4):
    # put data in pandas dataframe
    data = 'MissingData{}.txt'.format(i)
    X = pd.read_csv(data, sep='\s+', header=None)
    
    X = X.transpose()
    
    # header
    print('\n''*** MissingData{} ***'''.format(i))
    print('Number of Samples: ' + str(X.shape[0]))
    numberOfFeatures = X.shape[1]
    print('Number of Features: ' + str(X.shape[1]))
    print("\nOriginal Data: \n")

    # set uknowns in dataset to NaN
    X = X[X < 1.00000000000000e+99]
    print(str(X))
    
    Xresult = weightedKnnImpute(X, 4)
    
    Xresult = Xresult.transpose()
    
    np.savetxt("MissingResults{}.txt".format(i), Xresult.values, fmt='%f', delimiter="\t")
    


*** MissingData1 ***
Number of Samples: 14
Number of Features: 242

Original Data: 

     0     1     2     3     4     5     6     7     8     9    ...   232  \
0  -0.11 -0.30  0.50  0.00  0.40  0.39  0.50 -0.52 -0.87 -1.39  ...  0.19   
1   0.02 -0.37  0.18 -0.11 -0.16 -0.18 -0.24 -0.48 -0.17 -0.57  ... -0.27   
2  -0.36 -0.18  0.41   NaN  0.31 -0.09 -0.16 -0.73  0.10 -0.16  ... -0.43   
3  -0.11 -0.09   NaN  0.19 -0.34 -0.04   NaN -0.42 -0.12 -0.02  ... -0.12   
4   0.48 -0.16   NaN  0.00  0.02 -0.25  0.04 -0.40   NaN  0.30  ... -0.07   
5  -0.20 -0.16  0.15 -0.07  0.21 -0.69   NaN -0.01  0.52  0.41  ...  0.21   
6   0.27 -0.10 -0.25  0.18 -0.70 -0.64 -0.11 -0.34 -0.28  0.03  ... -0.01   
7   0.29 -0.09 -0.41  0.18  0.11 -0.38  0.10  0.55  0.50  0.50  ...  0.24   
8  -0.34  0.46 -0.07 -0.23  0.08  0.41 -0.04   NaN  0.09  0.48  ...  0.04   
9  -0.05  0.32 -0.13 -0.16  0.05  0.33 -0.01  0.37 -0.01  0.51  ...  0.01   
10  0.23  0.02 -0.15  0.03 -0.09 -0.62 -0.04 -0.29  0.08 -0.11  ...

row: 1
row: 2
row: 3
row: 4
row: 5
row: 6
row: 7
row: 8
row: 9
row: 10
row: 11
row: 12
row: 13
row: 14
row: 15
row: 16
row: 17
row: 18
row: 19
row: 20
row: 21
row: 22
row: 23
row: 24
row: 25
row: 26
row: 27
row: 28
row: 29
row: 30
row: 31


# 4. Time Series Classification & Prediction Deep Learning

In [None]:
import numpy as np
import pandas as pd 
from itertools import product
import gc 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC

Load the data and train/test split

In [None]:
sales_test = pd.read_csv('test.csv.gz')
sales = pd.read_csv('sales_train.csv.gz', compression= 'gzip')
items = pd.read_csv('items.csv')
item_cats = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')
sample_submission = pd.read_csv('sample_submission.csv.gz', compression = 'gzip')


In [None]:
# Remove outliers
sales = sales[sales['item_price'] < 100000]
sales = sales[sales['item_cnt_day'] < 1000]

In [None]:
index_col = ['date_block_num','shop_id','item_id']
sales = sales.groupby(index_col).agg({'item_cnt_day': np.sum, 'item_price': np.mean}).reset_index()
sales.rename({'item_cnt_day': 'item_cnt_month'}, axis='columns', inplace=True)

In [None]:
sales = pd.merge(sales, items, on='item_id',how='left')

In [None]:
sales = downcast_dtypes(sales)
sales = sales.drop(['item_name'], axis=1)

In [None]:
sales_test = pd.merge(sales_test, items, on='item_id',how='left')
sales_test = sales_test.drop(['item_name'], axis=1)
sales_test['date_block_num'] = 34
sales_test.head()

In [None]:
# Train/Test Split
train_df = sales.copy()
test_df = sales_test.copy()

In [None]:
svm = LinearSVC()
svm.fit(train_df, test_df.values.flatten())
print(svm.score(train_df, test_df))