Preprocessing data for the K-means and GMM models
--> Output: each description vectorized into 300 features

In [None]:
!pip show sklearn

In [None]:
import pandas as pd

import spacy
import spacy.cli
from scipy import spatial
# Load the medium English model whose word vectors the rest of the notebook
# relies on. Download it only when it is missing, so that re-running the
# notebook does not re-install the package every time.
try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
# Mount Google Drive inside the Colab runtime so the input CSVs can be copied
# from it in the next cell.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Copy the two input files from the mounted Drive into the current working
# directory so that later cells can read them by bare file name.
# NOTE(review): the source paths are relative, so this assumes the working
# directory is the one that contains the `drive` mount point — confirm.
!cp drive/MyDrive/ActionLearning/Preprocessed_Data.csv .
!cp drive/MyDrive/ActionLearning/Stop_words.csv .

**Loading Data**

In [None]:
# Read the ticket export that was copied into the working directory.
df_master = pd.read_csv("Preprocessed_Data.csv")

In [None]:
# Preview the first five rows to check the columns that were loaded.
df_master.head(5)

Unnamed: 0,Number,Priority,Incident_area,Opened,Closed,Status,Problem,Configuration_item,Short_description,Description,Assignment_group,Resolution_notes,Reassignment_count,Resolve_time
0,INC0284448,P2 - High,SAP :: Sonda Procwork (PRW),44013.002951,44024.875023,Closed,,,VF - Not working,"Please, the transaction VF is not working, we ...",IMT APS SAP Order to Cash L2,"Hello Daniel,Many thanks for your reply, at ti...",1.0,158300.0
1,INC0284449,P4 - Low,SAP :: Technical,44013.003183,44023.083368,Closed,,SAP-ECC-PRD,Enqueue Processing,Alert Details: Name....................Enqueue...,IMT TCS SAP Basis,Monitoring the lock entries in P via incident ...,0.0,5694.0
2,INC0180368,P2 - High,Infrastructure & Servers :: Server (physical/v...,43831.011447,43842.500012,Closed,,GOYCSM0001DC,GOYCSMDC.ds.givaudan.com (Server does not comm...,"Dear Team,We received a ticket from our monito...",IMT NLI Ops EAME,There was a planned maintenance onsite that wa...,1.0,126566.0
3,INC0180373,P3 - Moderate,SAP :: Order to Cash (OTC),43831.132951,43871.625023,Closed,,,I cant connect to ESKER PRD & UAT,"I already got access for ESKER QA, UAT & PRD. ...",IMT Desktop Infra Support Indonesia,User confirmed issue was solved,3.0,1919058.0
4,INC0180380,P3 - Moderate,Business Application :: Google,43831.405255,43841.416678,Closed,,,I cannot access my google drive shortcut from ...,I cannot access my google drive shortcut from ...,IMT Service Desk AME,"User G drive was inactive, i reconfigured it, ...",0.0,128.0


*Remove rows with a null Description*

In [None]:
# Columns kept for the downstream text processing.
columns = [
    'Number',
    'Priority',
    'Incident_area',
    'Short_description',
    'Description',
    'Assignment_group',
    'Resolution_notes',
]

# Boolean mask: True for rows whose Description is present.
tags = df_master['Description'].notnull()

# Select the wanted rows and columns in a single step.
df_notnull = df_master.loc[tags, columns]
df_notnull.shape

(103346, 7)

In [None]:
# Persist the filtered frame without writing the pandas index as a column.
# NOTE(review): this file name differs from the input "Preprocessed_Data.csv"
# only by letter case; on a case-insensitive file system it would overwrite it.
df_notnull.to_csv("preprocessed_data.csv", index=False)

**NLP Processing**

In [None]:
# Column that carries the free-text ticket description.
columns = 'Description'

# The previous `df = df_master.copy()` was overwritten immediately by this
# assignment, so only the final binding is kept.
df = df_master[columns]

# The vectorization cells read from `data`, which was never defined anywhere.
# Take it from the null-filtered frame: the exported matrix has exactly as many
# rows as `df_notnull` (103346), so that is the frame it was built from.
data = df_notnull[columns]

*Tokenization*

In [None]:
def tokenizer(row):
    """Run the spaCy pipeline on ``row`` and return its tokens as a list.

    ``row`` is passed through ``str`` first, so non-string cells are processed
    as their string form.
    """
    return list(nlp(str(row)))

*Lemmatization*

In [None]:
def lemmatization(row):
    """Return the lemma of every token in ``row``, in document order.

    ``row`` is passed through ``str`` before being handed to the spaCy pipeline.
    """
    lemmas = []
    for tok in nlp(str(row)):
        lemmas.append(tok.lemma_)
    return lemmas

*Stop word Removal*

In [None]:
# Read the custom stop-word list.
# With pandas' default NA parsing, literal words such as "nan", "null" or "NA"
# are turned into missing values; the preview shows such a row (blank Word with
# Length 3.0). Disable the default NA strings so every Word stays a string, and
# treat only an empty field as missing in the numeric Length column.
stop_words = pd.read_csv(
    "Stop_words.csv",
    keep_default_na=False,
    na_values={"Length": [""]},
)
print(stop_words.shape)
stop_words.head()

(1414, 2)


Unnamed: 0,Word,Length
0,–,1.0
1,——,2.0
2,#,1.0
3,,3.0
4,$,1.0


In [None]:
# Register the custom stop words on spaCy's shared stop-word set.
# A missing Word cell comes back from pandas as a float NaN; drop those and
# coerce the rest to str so the set only ever contains strings.
custom_stop_words = stop_words['Word'].dropna().astype(str)
nlp.Defaults.stop_words.update(custom_stop_words)

print(len(nlp.Defaults.stop_words))

1542


In [None]:
def clean(row):
    """Return the lowercased lemmas of ``row`` with noise tokens removed.

    A token is dropped when it is punctuation, when its text is empty or
    whitespace only, or when it is a stop word. The stop-word test accepts
    either the exact token text or its lowercase form, so capitalised
    occurrences (e.g. "The" at the start of a sentence) are removed as well
    as lowercase ones.

    Parameters
    ----------
    row : any
        The text to clean; it is converted with ``str`` by ``tokenizer``.

    Returns
    -------
    list of str
        Lowercased lemmas of the tokens that were kept, in document order.
    """
    stop_set = nlp.Defaults.stop_words
    cleaned = []
    for token in tokenizer(row):
        text = token.text
        if token.is_punct or text.strip() == '':
            continue
        # Checking only the exact text misses capitalised stop words; checking
        # both forms keeps every previous match and adds the case-insensitive ones.
        if text in stop_set or text.lower() in stop_set:
            continue
        cleaned.append(token.lemma_.lower())
    return cleaned

In [None]:
def vector(row):
    """Embed ``row`` as a list of numbers.

    The text is cleaned with ``clean``, the surviving lemmas are joined with
    single spaces, and the spaCy document vector of that string is returned
    as a plain list.
    """
    lemmas = clean(str(row))
    doc = nlp(' '.join(lemmas))
    return list(doc.vector)

In [None]:
# Sanity check on the first description; the vector is expected to have shape (300,).
# The shape note used to be written as a call on print()'s return value, which
# raised TypeError after printing. `.iloc[0]` picks the first row by position,
# which still works when index label 0 is absent.
print(vector(data.iloc[0]))

[-0.054270532, 0.13069189, -0.1375125, 0.03478963, 0.09665124, -0.0009659955, -0.1431185, -0.22028999, 0.024922501, 1.1460999, -0.16021125, 0.22867088, -0.010778, -0.13176799, 0.075039, 0.024886629, -0.03982125, 1.0875375, 0.1410275, 0.037318006, 0.05519975, 0.03424879, -0.12205363, -0.11233962, -0.0911505, -0.07442762, 0.040034864, -0.079109006, 0.012973875, 0.1082005, 0.014538659, -0.20315562, 0.042588625, 0.108771, -0.07984034, 0.114575624, 0.19470862, -0.08985924, 0.06027937, -0.17035814, -0.12467025, 0.12252262, 0.14420363, -0.1322075, -0.08058661, 0.03703526, 0.09712362, -0.03885313, -0.0376765, 0.11634625, 0.14116625, -0.0396355, 0.14236775, 0.17513324, 0.20370251, -0.12630437, -0.076155625, -0.04418875, -0.119681254, -0.07446125, -0.12471, -0.06121537, -0.0823725, 0.129415, 0.17095426, -0.08587312, 0.019422999, 0.19854012, 0.00053950027, 0.05547288, 0.2178525, 0.08024613, 0.15558138, 0.039725628, 0.21906537, 0.019284623, -0.030990746, -0.24792776, 0.037652124, 0.103869714, -0.0

In [None]:
# Vectorize every description; each cell of the result holds one vector as a list.
embedded_data = data.apply(func=vector)

In [None]:
# Collect the per-row vectors into a plain Python list, in row order.
final_data = list(embedded_data)

In [None]:
import numpy as np

# Stack the per-row vectors into one matrix: one row per description.
final_data = np.array(final_data)

# Guard the export: if the vectors did not all have the same length, the
# result would not be a 2-D matrix and the CSV written below would be wrong.
assert final_data.ndim == 2, f"expected a 2-D matrix, got shape {final_data.shape}"
final_data.shape

(103346, 300)

**Export Data**

In [None]:
# Write the embedding matrix as comma-separated text, one row per description.
np.savetxt(fname="Final_Data.csv", X=final_data, delimiter=",")