In [None]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer as LabelBinarize
import _pickle as pickle
from sklearn import model_selection
from sklearn import linear_model
import gc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn import metrics 
from sklearn import preprocessing

**Download Dataset**

In [None]:
import requests
import os

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# write the trained models to, paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def download_file(url, dest_dir="/content/drive/My Drive"):
    """Stream the file at ``url`` to disk and return the path it was written to.

    Args:
        url: Address of the file to fetch. The text after the last '/' is used
            as the local file name.
        dest_dir: Directory the file is written into. Defaults to the root of
            the mounted Google Drive.

    Returns:
        The full path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never silently saved as if it were the dataset.
    """
    local_filename = url.split('/')[-1]
    local_path = os.path.join(dest_dir, local_filename)
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_path, "wb") as file:
            for block in r.iter_content(chunk_size=1024):
                if block:  # skip empty chunks
                    file.write(block)
    return local_path

In [None]:
download_file("http://opendata.deepsig.io/datasets/2016.10/RML2016.10b.tar.bz2")
!tar -xf  'drive/My Drive/AssignFour/RML2016.10b.tar.bz2'

In [None]:
# Location of the pickled dataset that the "Read Dataset" cell opens.
file_path = 'drive/My Drive/AssignFour/RML2016.10b.dat'

In [None]:
# List the contents of the AssignFour folder on the mounted Drive.
!ls 'drive/My Drive/AssignFour'

comb_decision_tree.sav	raw_logistic.sav  RML2016.10b.tar.bz2
comb_logistic.sav	RML2016.10b.dat


**Read Dataset**

In [None]:
# Load the pickled dataset dictionary. Keys are 2-tuples such as ('QPSK', 2)
# (presumably (modulation, SNR) -- confirm against the dataset docs) and each
# value is an array of examples.
# NOTE(review): pickle.load can execute arbitrary code; only load archives
# obtained from a trusted source.
with open(file_path, 'rb') as openedFile:
    dataset = pickle.load(openedFile, encoding='latin1')
print(dataset[('QPSK', 2)].shape)
print(dataset[('PAM4', 8)].shape)
keys_list = list(dataset.keys())

# Stack the per-key arrays along the example axis. The dictionary is kept under
# its own name (`dataset`) so that re-running this cell does not try to index
# the stacked array with a tuple key.
blocks = [dataset[key] for key in keys_list]
data = np.concatenate(blocks, axis=0)

# One label row per example, repeated in the same order as the rows of `data`.
examples_per_key = [block.shape[0] for block in blocks]
labels = np.repeat(np.array(keys_list), examples_per_key, axis=0)

# Drop the now-duplicated per-key arrays so only the stacked copy stays in memory.
del dataset, blocks
gc.collect()

print(data.shape)

print(labels.shape)


(6000, 2, 128)
(6000, 2, 128)
(1200000, 2, 128)
(1200000, 2)


# **Classifiers**

**Logistic Regression Classifier**

In [None]:
def logistic_regression(train_x, train_l, validate_x, validate_l, test_x, test_l, filename,
                        model_dir='drive/My Drive/AssignFour/'):
  """Fit a logistic-regression classifier, print its accuracy and pickle it.

  Args:
    train_x, validate_x, test_x: Feature matrices, one row per example.
    train_l, validate_l, test_l: One-hot label matrices; they are converted to
      class indices with argmax along axis 1.
    filename: Prefix of the saved model file; "logistic.sav" is appended.
    model_dir: Directory the pickled model is written to.

  Returns:
    None. Validation and test accuracy are printed.
  """
  lm = linear_model.LogisticRegression(max_iter=500)
  lm.fit(train_x, np.argmax(train_l, axis=1))
  print("Validation of Logistic Regression: ")
  print(lm.score(validate_x, np.argmax(validate_l, axis=1)))
  print("Test of Logisitc Regression")
  print(lm.score(test_x, np.argmax(test_l, axis=1)))
  filename_with_path = os.path.join(model_dir, filename + "logistic.sav")
  # Use a context manager so the file is flushed and closed once the dump finishes.
  with open(filename_with_path, 'wb') as model_file:
    pickle.dump(lm, model_file)


**Decision Tree**

In [None]:
def decision_tree(train_x, train_l, validate_x, validate_l, test_x, test_l, filename,
                  model_dir='drive/My Drive/AssignFour/'):
  """Fit a depth-limited decision tree, print its accuracy and pickle it.

  Args:
    train_x, validate_x, test_x: Feature matrices, one row per example.
    train_l, validate_l, test_l: One-hot label matrices; they are converted to
      class indices with argmax along axis 1.
    filename: Prefix of the saved model file; "decision_tree.sav" is appended.
    model_dir: Directory the pickled model is written to.

  Returns:
    None. Validation and test accuracy are printed.
  """
  classifier = DecisionTreeClassifier(max_depth=10)
  classifier.fit(train_x, np.argmax(train_l, axis=1))
  print("Validation of Decision Tree: ")
  print(classifier.score(validate_x, np.argmax(validate_l, axis=1)))
  print("Test of Decision Tree")
  print(classifier.score(test_x, np.argmax(test_l, axis=1)))
  filename_with_path = os.path.join(model_dir, filename + "decision_tree.sav")
  # Use a context manager so the file is flushed and closed once the dump finishes.
  with open(filename_with_path, 'wb') as model_file:
    pickle.dump(classifier, model_file)


**Random Forest**

In [None]:
def randomforest(train_x, train_l, validate_x, validate_l, test_x, test_l, filename,
                 model_dir='drive/My Drive/AssignFour/'):
  """Fit a shallow random forest, print its accuracy and pickle it.

  Args:
    train_x, validate_x, test_x: Feature matrices, one row per example.
    train_l, validate_l, test_l: One-hot label matrices; they are converted to
      class indices with argmax along axis 1.
    filename: Prefix of the saved model file; "randomforst.sav" is appended
      (spelling kept so previously saved models keep their names).
    model_dir: Directory the pickled model is written to.

  Returns:
    None. Validation and test accuracy are printed.
  """
  forest_model = RandomForestClassifier(random_state=0, max_depth=5)
  forest_model.fit(train_x, np.argmax(train_l, axis=1))
  print("Validation of Random Forest: ")
  p = forest_model.score(validate_x, np.argmax(validate_l, axis=1))
  print(p)
  print("Test of Random Forest: ")
  p = forest_model.score(test_x, np.argmax(test_l, axis=1))
  print(p)
  filename_with_path = os.path.join(model_dir, filename + "randomforst.sav")
  # Use a context manager so the file is flushed and closed once the dump finishes.
  with open(filename_with_path, 'wb') as model_file:
    pickle.dump(forest_model, model_file)

# **Feature Spaces**

**Raw time series as given (two channels)**

In [None]:
# Feature space 1: flatten every (2, 128) example into one 256-value row.
# reshape() never modifies `data`, so no defensive copy is required, and the
# row count is taken from the array instead of being hard-coded.
raw_feature = data.reshape(data.shape[0], -1)

# 70/30 train/test split, then 5% of the training part is held out for validation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(raw_feature, labels, test_size= 0.3, random_state = 42)
X_train_final, X_validate, y_train_final, y_validate = model_selection.train_test_split(X_train, y_train, test_size= 0.05, random_state = 42)

# Fit the scaler on the training rows only, so validation/test statistics do not
# leak into preprocessing, then apply the same transform to every partition.
scaler = preprocessing.StandardScaler().fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

# One-hot encode the first label column. fit() is enough here: the matrix that
# fit_transform() would build for all rows was being thrown away.
lb = LabelBinarize()
lb.fit(labels[:,0])
y_train_final = lb.transform(y_train_final[:,0])
y_validate = lb.transform(y_validate[:,0])
y_test = lb.transform(y_test[:,0])

del raw_feature
gc.collect()
logistic_regression(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "raw_")
decision_tree(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "raw_")
randomforest(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "raw_")

Validation of Logistic Regression: 
0.16247619047619047
Test of Logisitc Regression
0.16101944444444444
Validation of Decision Tree: 
0.2517857142857143
Test of Decision Tree
0.25361666666666666
Validation of Random Forest: 
0.28095238095238095
Test of Random Forest: 
0.280575


**First derivative in time (two channels)**

In [None]:
# Feature space 2: first derivative of each channel along the sample axis.
# np.gradient over the last axis yields the same values as calling it on every
# channel of every example in a Python loop, without the per-row overhead.
der_data = np.gradient(data, axis=-1).reshape(data.shape[0], -1)

# 70/30 train/test split, then 5% of the training part is held out for validation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(der_data, labels, test_size= 0.3, random_state = 42)
X_train_final, X_validate, y_train_final, y_validate = model_selection.train_test_split(X_train, y_train, test_size= 0.05, random_state = 42)

# Fit the scaler on the training rows only, so validation/test statistics do not
# leak into preprocessing, then apply the same transform to every partition.
scaler = preprocessing.StandardScaler().fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

# One-hot encode the first label column. fit() is enough here: the matrix that
# fit_transform() would build for all rows was being thrown away.
lb = LabelBinarize()
lb.fit(labels[:,0])
y_train_final = lb.transform(y_train_final[:,0])
y_validate = lb.transform(y_validate[:,0])
y_test = lb.transform(y_test[:,0])

del der_data
gc.collect()
logistic_regression(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "der_")
decision_tree(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "der_")
randomforest(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "der_")


Validation of Logistic Regression: 
0.11547619047619048
Test of Logisitc Regression
0.11530833333333333
Validation of Decision Tree: 
0.21492857142857144
Test of Decision Tree
0.21413888888888888
Validation of Random Forest: 
0.19502380952380952
Test of Random Forest: 
0.19260833333333333




**Integral in time (two channels)**

In [None]:
# Feature space 3: running sum (discrete integral) of each channel along the
# sample axis. A single cumsum over the last axis replaces the per-example loop
# and produces the same rows.
int_data = np.cumsum(data, axis=-1).reshape(data.shape[0], -1)

# 70/30 train/test split, then 5% of the training part is held out for validation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(int_data, labels, test_size= 0.3, random_state = 42)
X_train_final, X_validate, y_train_final, y_validate = model_selection.train_test_split(X_train, y_train, test_size= 0.05, random_state = 42)

# Fit the scaler on the training rows only, so validation/test statistics do not
# leak into preprocessing, then apply the same transform to every partition.
scaler = preprocessing.StandardScaler().fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

# One-hot encode the first label column. fit() is enough here: the matrix that
# fit_transform() would build for all rows was being thrown away.
lb = LabelBinarize()
lb.fit(labels[:,0])
y_train_final = lb.transform(y_train_final[:,0])
y_validate = lb.transform(y_validate[:,0])
y_test = lb.transform(y_test[:,0])

del int_data
gc.collect()
logistic_regression(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "integ_")
decision_tree(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "integ_")
randomforest(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "integ_")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Validation of Logistic Regression: 
0.16192857142857142
Test of Logisitc Regression
0.16137222222222222
Validation of Decision Tree: 
0.24183333333333334
Test of Decision Tree
0.2420138888888889
Validation of Random Forest: 
0.17885714285714285
Test of Random Forest: 
0.17941666666666667


**Combination of feature spaces 1, 2 and 3: raw, derivative and integral (six channels)**

In [None]:
# Feature space 4: six channels per example, stacked in the order
# raw ch0, raw ch1, integral ch0, integral ch1, derivative ch0, derivative ch1,
# then flattened to one row. Built with whole-array operations instead of a
# per-example Python loop; the resulting rows are the same.
comb_data = np.concatenate(
    [data, np.cumsum(data, axis=-1), np.gradient(data, axis=-1)],
    axis=1,
).reshape(data.shape[0], -1)

# 70/30 train/test split, then 5% of the training part is held out for validation.
X_train, X_test, y_train, y_test = model_selection.train_test_split(comb_data, labels, test_size= 0.3, random_state = 42)
X_train_final, X_validate, y_train_final, y_validate = model_selection.train_test_split(X_train, y_train, test_size= 0.05, random_state = 42)

# Fit the scaler on the training rows only, so validation/test statistics do not
# leak into preprocessing, then apply the same transform to every partition.
scaler = preprocessing.StandardScaler().fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)

# One-hot encode the first label column. fit() is enough here: the matrix that
# fit_transform() would build for all rows was being thrown away.
lb = LabelBinarize()
lb.fit(labels[:,0])
y_train_final = lb.transform(y_train_final[:,0])
y_validate = lb.transform(y_validate[:,0])
y_test = lb.transform(y_test[:,0])

del comb_data
gc.collect()
logistic_regression(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "comb_")
decision_tree(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "comb_")
randomforest(X_train_final, y_train_final, X_validate, y_validate, X_test, y_test, "comb_")

Validation of Logistic Regression: 
0.16230952380952382
Test of Logisitc Regression
0.16098055555555554
Validation of Decision Tree: 
0.24114285714285713
Test of Decision Tree
0.24228333333333332
Validation of Random Forest: 
0.22235714285714286
Test of Random Forest: 
0.22502777777777777
