In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import transformers as ppb
import nltk
import re
import torch.nn as nn 

In [None]:
!pip install openpyxl
data = pd.read_excel("../input/data-222/incident son 6 aylk.xlsx")
data = data[['Description','Caller','Business service','Incident state',
              'Impact','Urgency','Category','Subcategory','Assignment group']]


In [None]:
#txt preprocess
def utils_preprocess_text(text,flg_lemma = True , lst_stopwords=None):
    """Normalise one piece of text and optionally drop stop words.

    The input is converted with ``str()``, lower-cased, stripped, and every
    character that is neither a word character nor whitespace is removed.
    The result is split on whitespace and re-joined with single spaces.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        flg_lemma: Accepted for compatibility; not used by this function.
        lst_stopwords: Optional container of words to remove. Membership is
            tested with ``in`` against the lower-cased tokens.

    Returns:
        The cleaned text as a single space-separated string.
    """
    normalized = str(text).lower().strip()
    normalized = re.sub(r'[^\w\s]', '', normalized)
    tokens = normalized.split()

    if lst_stopwords is None:
        return " ".join(tokens)

    # Remove stop words.
    kept = []
    for token in tokens:
        if token in lst_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)
def textCleaner(df = None , src = 'content' ,dst = 'text_clean',stop_words = 'turkish'):
    """Clean a whole text column of a dataframe.

    Applies ``utils_preprocess_text`` to every value of ``df[src]`` and stores
    the result in ``df[dst]`` (the frame is modified in place; nothing is
    returned).

    Args:
        df: DataFrame to clean.
        src: Name of the column holding the raw text.
        dst: Name of the column that receives the cleaned text.
        stop_words: Passed through as ``lst_stopwords``.
            NOTE(review): the default is a plain string, for which the ``in``
            test is a substring test - callers should pass a list of words.
    """
    def _clean(raw_value):
        return utils_preprocess_text(raw_value, lst_stopwords=stop_words)

    df[dst] = df[src].apply(_clean)
# Load the Turkish stop-word list: the first whitespace-separated token of every line.
with open("../input/stop-words/turkish_stop_words.txt", 'r', encoding = 'utf-8') as fdict:
    tr_stp_wrds = []
    for line in fdict:
        tokens = line.split()
        # Blank or whitespace-only lines carry no token; indexing [0] on them
        # would raise IndexError, so they are skipped.
        if tokens:
            tr_stp_wrds.append(tokens[0])

# Add the cleaned 'text_clean' column, computed from 'Description'.
textCleaner(df = data,src = 'Description',stop_words = tr_stp_wrds)
#data_.drop("Description", inplace = True, axis = 1) 
#data_.reset_index(drop = True, inplace = True)

In [None]:
# Shape of the array of distinct 'Assignment group' values; its single entry is
# the number of distinct values in the target column.
data['Assignment group'].unique().shape

In [None]:
def pre_process(dataset):
    """Integer-encode the string-valued columns of ``dataset`` in place.

    Every column except ``'text_clean'`` whose first non-missing distinct value
    is a ``str`` is replaced by integer codes. Codes follow the order in which
    the distinct values first appear (the order of ``Series.unique()``), so a
    value's code is its position in that array; a missing value that occurs in
    such a column gets a code of its own, like any other distinct value.
    Columns that are not string-valued are left untouched.

    The whole column is mapped in one pass through a value -> code dictionary.
    Replacing one value at a time is quadratic in the number of distinct values
    and can merge categories when the column also contains an integer equal to
    a code that was already assigned.

    Args:
        dataset: DataFrame to encode; modified in place.

    Returns:
        None.
    """
    for col in dataset.columns:
        if col == 'text_clean':
            continue

        unique_things = dataset[col].unique()
        # Decide on the first value that is actually present: a leading NaN must
        # not hide a string column, and an empty column has nothing to encode.
        first_present = next((value for value in unique_things if pd.notna(value)), None)
        if not isinstance(first_present, str):
            continue

        codes = {value: code for code, value in enumerate(unique_things)}
        dataset[col] = dataset[col].map(codes).astype(int)
# Encode the categorical columns in place, then drop every row that still has a missing value.
pre_process(data)
data = data.dropna()

# Split the frame into the three arrays used downstream:
# cleaned text, the seven encoded side features, and the target column.
x_txt = np.array(data['text_clean'])
x_rest = np.array(data[['Caller' ,'Business service', 'Incident state', 'Impact' ,'Urgency' ,'Category' ,'Subcategory']])
y = np.array(data['Assignment group'])

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
pretrained = 'dbmdz/distilbert-base-turkish-cased'
tokenizer = ppb.DistilBertTokenizer.from_pretrained(pretrained)
vectorizer = ppb.DistilBertModel.from_pretrained(pretrained)
vectorizer = vectorizer.to(device)

In [None]:
#sentence vectorizing process
# For every row: encode the cleaned text with the DistilBERT model and keep the
# hidden state of the first token, together with the row's side features and
# its target class.
vectors = []
rest_vect = []
classes = []
max_len = 512  # rows whose token sequence is longer than this are skipped
skipped = 0
for txt_,rest,target in tqdm(zip(x_txt,x_rest,y), total=len(x_txt)):
    try:
        txt_encd = tokenizer.encode(txt_)
        if len(txt_encd) > max_len:
            # Same rows as before are left out, but they are now counted and reported.
            skipped += 1
            continue

        # One un-padded sequence per forward pass. Padding the ids with 0 without
        # an attention mask lets the model attend to the padding positions,
        # which alters the first-token embedding; a batch of one needs no padding.
        input_ids = torch.tensor([txt_encd], dtype=torch.long, device=device)
        with torch.no_grad():
            # NOTE(review): clamping to [0, 1] zeroes every negative component
            # of the embedding - confirm that this is intended.
            t_1 = torch.clamp(vectorizer(input_ids)[0][:, 0, :],0,1)
        vectors.append(t_1)
        rest_vect.append(torch.tensor(rest))
        classes.append(torch.tensor(target))
    except Exception:
        # Show the offending text, then re-raise the original error with its traceback.
        print(txt_)
        raise

if skipped:
    print("Skipped {} rows longer than {} tokens".format(skipped, max_len))
    



In [None]:
#creating the datasets
class dataset():
    """Index-aligned container of sentence vectors, side-feature vectors and classes."""

    def __init__(self,sentence_vectors,other_vectors,classes):
        """Store the three parallel sequences as given (no copies are made)."""
        self.sentence_vectors = sentence_vectors
        self.other_vectors = other_vectors
        self.classes = classes

    def __getitem__(self, index: int):
        """Return ``[sentence_vector, other_vector, class]`` taken at ``index``.

        ``index`` is applied to each stored sequence unchanged, so whatever
        those sequences accept as an index (including a slice) works here too.
        """
        parts = (self.sentence_vectors, self.other_vectors, self.classes)
        return [part[index] for part in parts]

    def __len__(self) -> int:
        """Number of samples, taken from the sentence-vector sequence."""
        return len(self.sentence_vectors)
    
#dataset_ = dataset(x_txt,x_rest,y)
# Wrap the extracted samples (text embedding, side features, class) in one dataset object.
dataset_ = dataset(vectors,rest_vect,classes)




In [None]:
# Pair each sample's text embedding with its side features; each sample is the
# three-element list returned by dataset.__getitem__, so [:2] drops the class.
concated = []
for x in tqdm(dataset_):
    concated.append(x[:2])

In [None]:
# Hold out 10% of the samples for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split( concated ,dataset_[:][2],test_size=0.1, random_state=42)
vectors_train = [sample[0] for sample in X_train]
other_vectors_train = [sample[1] for sample in X_train]

# The validation features must come from X_test so that they line up with y_test.
# Taking them from X_train validates on training samples paired with unrelated labels.
vectors_test = [sample[0] for sample in X_test]
other_vectors_test = [sample[1] for sample in X_test]

data_train = dataset(vectors_train,other_vectors_train,y_train)
data_val = dataset(vectors_test,other_vectors_test,y_test)

In [None]:
#creating the dataloaders
# Both loaders use the same settings: one sample per batch, shuffled, loaded in the main process.
dataloader_train, dataloader_val = (
    torch.utils.data.DataLoader(split, batch_size = 1, shuffle = True, num_workers = 0)
    for split in (data_train, data_val)
)

In [None]:
#basic classifier models
class classifier(nn.Module):
    """Two-layer feed-forward classifier that returns raw class scores (logits).

    Args:
        in_features: Size of the input vector. The default, 768 + 7, matches a
            768-dimensional text embedding concatenated with 7 side features.
        hidden_size: Width of the hidden layer.
        n_classes: Number of output scores.
    """

    def __init__(self, in_features=768+7, hidden_size=500, n_classes=71):
        super(classifier,self).__init__()

        self.linear_0 = nn.Linear(in_features,hidden_size)
        self.linear_1 = nn.Linear(hidden_size,n_classes)
        # Not applied in forward(); kept for callers that want probabilities.
        # An explicit dim avoids the implicit-dimension deprecation warning.
        self.softmax = nn.Softmax(dim=-1)
        self.leakyrelu = nn.LeakyReLU(0.15)

    def forward(self,x):
        """Map ``x`` (last dimension ``in_features``) to ``n_classes`` logits.

        No softmax is applied, so the output can be fed directly to a loss that
        expects unnormalised scores.
        """
        x = self.linear_0(x)
        x = self.leakyrelu(x)
        x = self.linear_1(x)
        return x
# Instantiate the classifier with its default sizes and move it to the selected device.
classifier_model = classifier().to(device)



In [None]:
#training and validation phase
# Multi-class classification over integer class ids: cross-entropy on the raw
# logits. (An MSE criterion on logits vs. a class index is not a valid objective.)
# NOTE(review): every class id must be smaller than the number of logits the
# classifier produces - confirm against the number of distinct target values.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier_model.parameters(),lr= 0.01,weight_decay = 0.001)
n_epochs = 100
valid_loss_min = np.inf  # the np.Inf alias no longer exists in NumPy 2.0
batch_size = 4  # single-sample losses accumulated before each optimizer step
for epoch in range(1, n_epochs+1):
    print("Training Process...")
    valid_loss = 0.0
    true_train = 0
    true_valid = 0
    times = 0
    train_total_loss = 0.0
    loss = 0
    #setting the classifier mode to train
    classifier_model.train()
    for vector,other_vector,target in tqdm(dataloader_train):
        vector = vector.to(device)
        other_vector = other_vector.to(device)
        # CrossEntropyLoss expects integer class indices.
        target = target.to(device=device, dtype=torch.long)

        # Drop the batch dimension(s) of the single sample and join the text
        # embedding with the side features, cast to the embedding's dtype.
        model_input = torch.cat((vector.squeeze(0).squeeze(0),
                                 other_vector.squeeze(0).to(vector.dtype)),0)
        output = classifier_model(model_input)

        pred = torch.argmax(output).unsqueeze(0)
        if (pred == target):
            true_train += 1

        sample_loss = criterion(output.unsqueeze(0),target)
        # Report each sample's loss exactly once.
        train_total_loss += sample_loss.item()
        loss = loss + sample_loss
        times+=1
        if(times % batch_size == 0):
            #gradient accumulation phase
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss = 0

    # The last group of an epoch can hold fewer than batch_size samples;
    # step on it as well so those samples still contribute to training.
    if times % batch_size != 0:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss = 0

    #setting the classifier mode to evaluation
    print("Validation Process...")
    classifier_model.eval()
    with torch.no_grad():
        for vector,other_vector,target in tqdm(dataloader_val):
            vector = vector.to(device)
            other_vector = other_vector.to(device)
            target = target.to(device=device, dtype=torch.long)

            model_input = torch.cat((vector.squeeze(0).squeeze(0),
                                     other_vector.squeeze(0).to(vector.dtype)),0)
            output = classifier_model(model_input)

            pred = torch.argmax(output).unsqueeze(0)
            if (pred == target):
                true_valid += 1
            valid_loss += criterion(output.unsqueeze(0),target).item()

    print('Epoch: {} \tTraining Loss: {:.6f} \tTraining Accuracy: {:.6f} \tValidation Loss: {:.6f} \t Validation  Accuracy: {:.6f}'.format(
        epoch, train_total_loss,true_train/len(X_train), valid_loss,true_valid/len(X_test)))

In [None]:
##### LSTM models: the cells below build and train a Keras bidirectional-LSTM classifier on the same incident data

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
#import tensorflow_addons as tfa
#!pip install --upgrade tensorflow
#!pip install --upgrade tensorflow-gpu
from tensorflow import keras 
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,LeakyReLU
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import CategoricalCrossentropy

In [None]:
!pip install openpyxl
data = pd.read_excel("../input/data-222/incident son 6 aylk.xlsx")
data = data[['Description','Caller','Business service','Incident state',
              'Impact','Urgency','Category','Subcategory','Assignment group']]

In [None]:
#txt preprocess
def utils_preprocess_text(text,flg_lemma = True , lst_stopwords=None):
    """Normalise one piece of text and optionally drop stop words.

    The input is converted with ``str()``, lower-cased, stripped, and every
    character that is neither a word character nor whitespace is removed.
    The result is split on whitespace and re-joined with single spaces.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        flg_lemma: Accepted for compatibility; not used by this function.
        lst_stopwords: Optional container of words to remove. Membership is
            tested with ``in`` against the lower-cased tokens.

    Returns:
        The cleaned text as a single space-separated string.
    """
    normalized = str(text).lower().strip()
    normalized = re.sub(r'[^\w\s]', '', normalized)
    tokens = normalized.split()

    if lst_stopwords is None:
        return " ".join(tokens)

    # Remove stop words.
    kept = []
    for token in tokens:
        if token in lst_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)
def textCleaner(df = None , src = 'content' ,dst = 'text_clean',stop_words = 'turkish'):
    """Clean a whole text column of a dataframe.

    Applies ``utils_preprocess_text`` to every value of ``df[src]`` and stores
    the result in ``df[dst]`` (the frame is modified in place; nothing is
    returned).

    Args:
        df: DataFrame to clean.
        src: Name of the column holding the raw text.
        dst: Name of the column that receives the cleaned text.
        stop_words: Passed through as ``lst_stopwords``.
            NOTE(review): the default is a plain string, for which the ``in``
            test is a substring test - callers should pass a list of words.
    """
    def _clean(raw_value):
        return utils_preprocess_text(raw_value, lst_stopwords=stop_words)

    df[dst] = df[src].apply(_clean)
# Load the Turkish stop-word list: the first whitespace-separated token of every line.
with open("../input/stop-words/turkish_stop_words.txt", 'r', encoding = 'utf-8') as fdict:
    tr_stp_wrds = []
    for line in fdict:
        tokens = line.split()
        # Blank or whitespace-only lines carry no token; indexing [0] on them
        # would raise IndexError, so they are skipped.
        if tokens:
            tr_stp_wrds.append(tokens[0])

# Add the cleaned 'text_clean' column, computed from 'Description'.
textCleaner(df = data,src = 'Description',stop_words = tr_stp_wrds)
#data_.drop("Description", inplace = True, axis = 1) 
#data_.reset_index(drop = True, inplace = True)

In [None]:
def pre_process(dataset):
    """Integer-encode the string-valued columns of ``dataset`` in place.

    Every column except ``'text_clean'`` whose first non-missing distinct value
    is a ``str`` is replaced by integer codes. Codes follow the order in which
    the distinct values first appear (the order of ``Series.unique()``), so a
    value's code is its position in that array; a missing value that occurs in
    such a column gets a code of its own, like any other distinct value.
    Columns that are not string-valued are left untouched.

    The whole column is mapped in one pass through a value -> code dictionary.
    Replacing one value at a time is quadratic in the number of distinct values
    and can merge categories when the column also contains an integer equal to
    a code that was already assigned.

    Args:
        dataset: DataFrame to encode; modified in place.

    Returns:
        None.
    """
    for col in dataset.columns:
        if col == 'text_clean':
            continue

        unique_things = dataset[col].unique()
        # Decide on the first value that is actually present: a leading NaN must
        # not hide a string column, and an empty column has nothing to encode.
        first_present = next((value for value in unique_things if pd.notna(value)), None)
        if not isinstance(first_present, str):
            continue

        codes = {value: code for code, value in enumerate(unique_things)}
        dataset[col] = dataset[col].map(codes).astype(int)
# Encode the categorical columns in place, then drop every row that still has a missing value.
pre_process(data)
data = data.dropna()

# Split the frame into the three arrays used downstream:
# cleaned text, the seven encoded side features, and the target column.
x_txt = np.array(data['text_clean'])
x_rest = np.array(data[['Caller' ,'Business service', 'Incident state', 'Impact' ,'Urgency' ,'Category' ,'Subcategory']])
y = np.array(data['Assignment group'])

In [None]:
# Vocabulary size passed to the tokenizer (num_words) and to the Embedding layer.
max_features = 15000
# Length every token sequence is padded/cut to; also the model's input length.
maxlen = 100
# NOTE(review): this rebinds the name `tokenizer`, which an earlier cell uses
# for the DistilBERT tokenizer - re-running earlier cells afterwards will pick up this object.
tokenizer = text.Tokenizer(num_words=max_features)

In [None]:
#X, y = tags_[0]['text_clean'],tags_[0]['Main Category']

# Hold out a third of the texts for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split( x_txt, y, test_size=0.33, random_state=42)

# Fit the vocabulary on the training texts only, so the tokenizer never sees
# the held-out data and derives its tokens from the training split alone.
tokenizer.fit_on_texts(list(X_train))

# Words are converted to token ids, then padded at the end to `maxlen`.
x_train_, x_test_ = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(split), maxlen=maxlen,padding='post')
    for split in (X_train, X_test)
)

In [None]:
def base_model(n_classes=None):
    """Build and compile the bidirectional-LSTM text classifier.

    Architecture: embedding -> bidirectional LSTM (100 units, full sequence)
    -> global max pooling -> dropout -> dense(100, LeakyReLU) -> dropout
    -> softmax over ``n_classes``. Input length and vocabulary size are read
    from the notebook-level ``maxlen`` and ``max_features``.

    The output layer needs one unit per class: a softmax over a single unit is
    constantly 1.0 and cannot learn anything. The labels passed to ``fit`` are
    integer class ids, so the sparse variant of categorical cross-entropy is used.

    Args:
        n_classes: Number of target classes. When omitted it is derived from the
            notebook-level label array ``y`` as ``max(y) + 1``
            (assumes ``y`` holds integer class ids starting at 0 - TODO confirm).

    Returns:
        The compiled Keras model.
    """
    if n_classes is None:
        n_classes = int(np.max(y)) + 1

    sgd = SGD(learning_rate = 0.05,momentum = 0.6)
    embed_size = 32

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(100, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(100,activation = LeakyReLU())(x)
    x = Dropout(0.1)(x)
    x = Dense(n_classes, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy',
                    optimizer=sgd,
                    metrics=['accuracy'])
    return model
model = base_model()

In [None]:
# Train for 10 epochs with mini-batches of 4, evaluating on the held-out split after each epoch.
model.fit(x_train_,y_train,validation_data = (x_test_,y_test),batch_size = 4,epochs = 10)