In [53]:
import os
import numpy as np
import pandas as pd
import torch
import torchvision
import seaborn as sns

In [54]:
from sklearn.metrics import roc_auc_score

In [132]:
# root_dir = '../input/Kannada-MNIST/'
# test_dir = 'test.csv'
# train_dir = 'train.csv'
root_dir = 'D:\\filesForNumpyAndPandas\\SoftwareDeffects\\'
test_dir = 'test.csv'
train_dir = 'train.csv'

In [133]:
data = pd.read_csv(root_dir+train_dir)

In [134]:
print((data.corr().defects))

id                   0.001976
loc                  0.342642
v(g)                 0.301187
ev(g)                0.259928
iv(g)                0.245618
n                    0.258080
v                    0.231179
l                   -0.253237
d                    0.241936
i                    0.208577
e                    0.095366
b                    0.232594
t                    0.099592
lOCode               0.250604
lOComment            0.205402
lOBlank              0.257819
locCodeAndComment    0.133150
uniq_Op              0.178474
uniq_Opnd            0.246113
total_Op             0.250533
total_Opnd           0.252752
branchCount          0.322827
defects              1.000000
Name: defects, dtype: float64


In [135]:
y_train = data['defects'].astype(int)
X_train = data.drop(['defects','id','e','t'],axis=1)
print(X_train.shape)

(101763, 19)


In [136]:
def mean_0std_1(tensor):
    return (tensor - torch.mean(tensor, dim=0)) / torch.std(tensor, dim=0)

In [137]:
import random
from random import randint
class SMOTE(object):
    def __init__(self,distance='euclidian',dims=19,k=5):
        super(SMOTE,self).__init__()
        self.newindex = 0 
        self.k = k
        self.dims = dims
        self.distance_measure = distance
        
    def populate(self, N,i,nnarray,min_samples,k):
        while N:
            nn = randint(0, k-2)
            
            diff = min_samples[nnarray[nn]] - min_samples[i]
            gap = random.uniform(0,1)

            self.synthetic_arr[self.newindex,:] = min_samples[i] + gap * diff
            
            self.newindex += 1
            
            N -= 1
    def k_neighbors(self, euclid_distance, k):
        nearest_idx = torch.zeros((euclid_distance.shape[0],euclid_distance.shape[0]), dtype = torch.int64)
        #print(euclid_distance.shape)
        #return
        idxs = torch.argsort(euclid_distance, dim=1)
        nearest_idx[:,:] = idxs
        
        return nearest_idx[:,1:k]
    
    def find_k(self,X,k):
        euclid_distance = torch.zeros((X.shape[0],X.shape[0]), dtype = torch.float32)
        
        for i in range(len(X)):
            dif = (X - X[i])**2
            dist = torch.sqrt(dif.sum(axis=1))
            euclid_distance[i] = dist
            
        return self.k_neighbors(euclid_distance,k)
    
    def generate(self, min_samples, N,k):
        """
            Returns (N/100) * n_minority_samples synthetic minority samples.
    		Parameters
    		----------
    		min_samples : Numpy_array-like, shape = [n_minority_samples, n_features]
    		    Holds the minority samples
    		N : percetange of new synthetic samples: 
    		    n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
    		k : int. Number of nearest neighbours. 
    		Returns
    		-------
    		S : Synthetic samples. array, 
    		    shape = [(N/100) * n_minority_samples, n_features]. 
    	"""
        T = min_samples.shape[0]
        self.synthetic_arr = torch.zeros(int(N/100)*T,self.dims)
        N = int(N/100)
        if self.distance_measure == 'euclidian':
            indices = self.find_k(min_samples,k)
        for i in range(indices.shape[0]):
            self.populate(N, i, indices[i], min_samples, k)
        self.newindex = 0 
        return self.synthetic_arr
            
    def fit_generate(self,X,y):
        #get occurence of each class
        occ = torch.eye(int(y.max()+1),int(y.max()+1))[y].sum(axis=0)

        #get the dominant class
        dominant_class = torch.argmax(occ)
        #get occurence of the dominant class
        n_occ = int(occ[dominant_class].item())
        for i in range(len(occ)):
            if i != dominant_class:
                #calculate the amount of synthetic data to generate
                N = (n_occ - occ[i]) * 100 / occ[i]
                candidates = X[y == i]
                xs = self.generate(candidates, N,self.k)
                #print(xs.shape)
                X = torch.cat((X,xs))
                ys = torch.ones(xs.shape[0]) * i
                #print(ys.shape)
                y = torch.cat((y,ys))
        return X,y
                  

In [138]:
n = len(X_train)
val_part = 5
idx = np.random.permutation(n)
X_train = torch.Tensor(X_train.to_numpy())[idx]
y_train = torch.Tensor(y_train.to_numpy()).long()[idx]
X_train = mean_0std_1(X_train)

X_val = X_train[0:int(n/val_part)]
y_val = y_train[0:int(n/val_part)]
X_train = X_train[int(n/val_part):]
y_train = y_train[int(n/val_part):]

In [139]:
print(X_val.shape)
print(X_train.shape)

torch.Size([20352, 19])
torch.Size([81411, 19])


In [140]:



sm = SMOTE()
new_X_train = torch.zeros((0,19))
new_y_train = torch.zeros(0)
#print(X_train.shape,y_train.shape)
batch_size=100
for i in range(int(X_train.shape[0] / batch_size)):
    X_batch = X_train[i*batch_size:(i+1)*batch_size]
    y_batch = y_train[i*batch_size:(i+1)*batch_size]
    X,y = sm.fit_generate(X_batch,y_batch)
    new_X_train = torch.cat((new_X_train, X))
    new_y_train = torch.cat((new_y_train, y))

X_train = new_X_train
y_train = new_y_train





In [141]:
print(X_val.shape)
print(X_train.shape)


print(sum(y_val==1) / len(y_val))
print(sum(y_train==1)/ len(y_train))

torch.Size([20352, 19])
torch.Size([118425, 19])
tensor(0.2291)
tensor(0.4680)


In [142]:
class FullyConnectedNetwork(torch.nn.Module):
    def __init__(self, hidden_layers_1, hidden_layers_2,hidden_layers_3, hidden_layers_4,\
                 hidden_layers_5,hidden_layers_6,hidden_layers_7,hidden_layers_8):
        super().__init__()
        self.fc1 = torch.nn.Linear(19, hidden_layers_1)
        self.fc2 = torch.nn.Linear(hidden_layers_1, hidden_layers_2)
        self.fc3 = torch.nn.Linear(hidden_layers_2, hidden_layers_3)
        self.fc4 = torch.nn.Linear(hidden_layers_3, hidden_layers_4)
        self.fc5 = torch.nn.Linear(hidden_layers_4, 1)
        self.fc6 = torch.nn.Linear(hidden_layers_5, hidden_layers_6)
        self.fc7 = torch.nn.Linear(hidden_layers_6, hidden_layers_7)
        self.fc8 = torch.nn.Linear(hidden_layers_7, hidden_layers_8)
        self.fc9 = torch.nn.Linear(hidden_layers_8, 1)
        
        self.dropout = torch.nn.Dropout(p=0.2)
        
        self.act1 = torch.nn.LeakyReLU()
        
        self.act2 = torch.nn.LeakyReLU()
        self.act3 =  torch.nn.LeakyReLU()
        self.act4 =  torch.nn.LeakyReLU()
        self.act5 = torch.nn.Sigmoid()
        self.act6 = torch.nn.ReLU()
        self.act7 = torch.nn.ReLU()
        self.act8 = torch.nn.ReLU()
        self.act9 = torch.nn.Sigmoid()
        #normaliztion????
    
    def forward(self, x):
        #x = self.dropout(x)
       
        x = self.fc1(x)
        x = self.act1(x)
        
        x = self.dropout(x)
        
        #x = mean_0std_1(x)
        x = self.fc2(x)
        x = self.act2(x)
        
        x = self.dropout(x)
        #x = mean_0std_1(x)
        x = self.fc3(x) 
        x = self.act3(x)
        
        x = self.dropout(x)
       
        #x = mean_0std_1(x)
        x = self.fc4(x)
        x = self.act4(x)
        x = self.dropout(x)
        
        x = self.fc5(x)
        x = self.act5(x)
        
        
        return x

In [143]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [148]:

def train(net, X_train, y_train,X_val,y_val, optimizer, loss_func, epochs, batch_size):
    
    for epoch in range(epochs):
        
        for stage in ['train','val']:
            avg_auc = 0
            train_acc = 0
            loss = 0
            n = len(X_train) if stage == 'train' else len(X_val)
            idx = np.random.permutation(n)
            if stage == 'train':
                X_train = X_train[idx]
                y_train = y_train[idx]
            if stage == 'val':
                X_val = X_val[idx]
                y_val = y_val[idx]
            for batch in range(int(n/batch_size)):
                #print(n)
                if(stage == 'train'):
                    
                    X_batch = X_train[batch*batch_size:(batch+1)*batch_size - 1].to(device)
                    y_batch = y_train[batch*batch_size:(batch+1)*batch_size - 1].to(device)
                else:
                    X_batch = X_val[batch*batch_size:(batch+1)*batch_size - 1].to(device)
                    y_batch = y_val[batch*batch_size:(batch+1)*batch_size - 1].to(device)

                
                y_pred = net.forward(X_batch)
                #print(y_pred)
                with torch.set_grad_enabled(stage == 'train'):
                   
                    loss_value = loss_func(y_pred.squeeze().float(),y_batch.float())
                    
                    loss+=loss_value.item()
                    
                if(stage == 'train'):
                    loss_value.backward()
                    optimizer.step()
                y_pred = y_pred.squeeze()
                y_pred[y_pred < 0.5] = 0
                y_pred[y_pred > 0.5] = 1
                
                train_acc += (y_pred==y_batch).float().mean()
                #print(y_pred.squeeze())
                
                avg_auc+= (roc_auc_score(y_batch.cpu().detach().numpy() < 0.5,\
                                    y_pred.squeeze().cpu().detach().numpy() < 0.5))
                
            
            #print(torch.max(net.fc4.weight.grad),torch.min(net.fc4.weight.grad) )
            print('epoch: {},stage: {}, accuracy: {}, loss: {}, AUC: {}'.format(epoch,stage,train_acc/int(n/batch_size),\
                                                                              loss/int(n/batch_size),avg_auc/int(n/batch_size)))
           

In [149]:
import random

net = FullyConnectedNetwork(128,32,16,8,0,0,0,0).to(device)
optimizer = torch.optim.Adam(net.parameters(),lr=0.0001,weight_decay=0.0001)
loss_func = torch.nn.BCELoss()
#torch.manual_seed(0)
#random.seed(0)
epochs = 300
batch_size = 128




In [150]:
def print_grad(grad):
    pass
    #print(grad)

In [151]:
train(net, X_train, y_train,X_val,y_val, optimizer, loss_func, epochs, batch_size)

epoch: 0,stage: train, accuracy: 0.6838394999504089, loss: 0.6260727727413178, AUC: 0.6793421945446371
epoch: 0,stage: val, accuracy: 0.7334224581718445, loss: 0.5543812908841379, AUC: 0.7197875864781648
epoch: 1,stage: train, accuracy: 0.7140340805053711, loss: 0.5923071244922844, AUC: 0.7115978401206574
epoch: 1,stage: val, accuracy: 0.7360967397689819, loss: 0.5457684176522981, AUC: 0.7167096698218535
epoch: 2,stage: train, accuracy: 0.7177709341049194, loss: 0.5840436463420455, AUC: 0.7133946815460647
epoch: 2,stage: val, accuracy: 0.7599661946296692, loss: 0.5883565788748879, AUC: 0.7144726702050787
epoch: 3,stage: train, accuracy: 0.7156515717506409, loss: 0.5754902472689345, AUC: 0.7094624044874682
epoch: 3,stage: val, accuracy: 0.7823993563652039, loss: 0.5591067283783319, AUC: 0.7026840744815891
epoch: 4,stage: train, accuracy: 0.7170050144195557, loss: 0.5735340168347229, AUC: 0.7087312731843793
epoch: 4,stage: val, accuracy: 0.7783387303352356, loss: 0.534023384628056, AUC: 

KeyboardInterrupt: 

In [152]:
data_test = pd.read_csv(root_dir+test_dir)
print(data_test)

           id    loc  v(g)  ev(g)  iv(g)      n        v     l      d      i  \
0      101763   33.0   5.0    1.0    4.0  144.0   824.82  0.04  26.96  30.05   
1      101764   27.0   8.0    8.0    2.0  125.0   646.24  0.04  22.82  27.22   
2      101765  130.0  11.0    7.0   10.0  545.0  3831.40  0.02  48.15  66.17   
3      101766   65.0   7.0    1.0    7.0  156.0   855.71  0.06  17.23  49.89   
4      101767   22.0   3.0    1.0    3.0   52.0   238.42  0.10   9.60  26.70   
...       ...    ...   ...    ...    ...    ...      ...   ...    ...    ...   
67837  169600   41.0   1.0    1.0    1.0  125.0   656.55  0.07  14.00  47.61   
67838  169601   20.0   3.0    1.0    3.0   38.0   161.42  0.15   6.75  23.28   
67839  169602   24.0   2.0    1.0    2.0   52.0   240.00  0.11   9.00  26.67   
67840  169603   18.0   2.0    1.0    1.0   49.0   216.64  0.11   9.33  24.02   
67841  169604  101.0  15.0   10.0    5.0    0.0     0.00  0.00   0.00   0.00   

       ...        t  lOCode  lOComment 

In [153]:
net.cpu()
y_pred = net.forward(mean_0std_1(torch.Tensor(data_test.drop(['e','t'],axis=1).values))[:,1:])


In [154]:
l=[]
for el in y_pred:
    #print(el)
    l.append(el.item())

df = pd.DataFrame({'id':data_test['id'],'label':l})

In [155]:
print(df)

           id     label
0      101763  0.500863
1      101764  0.319206
2      101765  0.964017
3      101766  0.882941
4      101767  0.376878
...       ...       ...
67837  169600  0.459466
67838  169601  0.360111
67839  169602  0.382065
67840  169603  0.145151
67841  169604  0.945185

[67842 rows x 2 columns]


In [156]:
df.to_csv(root_dir+'submission.csv', index = False)

In [None]:
ten = torch.Tensor([[1,-1,1],
                   [1,0,0],
                   [3,3,2]])

In [None]:
ten_n = torch.mean(ten,dim=0)

In [None]:
print(ten_n)

In [None]:
ten = (ten - ten_n) /torch.std(ten, dim=0)

In [None]:
print(ten)