In [1]:
import config as m_config
from model import get_model
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import hstack
import sklearn
import sys
from utils import TensorLogger
import joblib

sys.path.append('.')

In [2]:
version = 1
version_descr = """
txt combined
"""
config = m_config.get_default_config(version)

In [3]:
scaler = MinMaxScaler()

data_path = str(Path('.') / config.data_folder / config.train_data_file)
dtypes = {'change_auswirkung': str}
df = pd.read_csv(data_path, dtype=dtypes)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222426 entries, 0 to 222425
Data columns (total 16 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   change_nr                  222426 non-null  object 
 1   dauer_arbtlog              222426 non-null  float64
 2   vorlauf_htransf            222426 non-null  float64
 3   mgrsite_bw_bund            222426 non-null  bool   
 4   mgrsite_bwi_it             222426 non-null  bool   
 5   eigenbetrieb               222426 non-null  bool   
 6   arb_pl_aufgabe_wfaktionen  171567 non-null  object 
 7   arb_pl_change_auftrag      222426 non-null  bool   
 8   arb_pl_typ_wartung         222426 non-null  bool   
 9   arb_pl_verwendet           222426 non-null  bool   
 10  ci_prioritaet              222426 non-null  float64
 11  ci_in_betrieb              222426 non-null  bool   
 12  ci_klass_loc               222275 non-null  object 
 13  kurztext_svc_klass         22

In [6]:
df['change_ausw_num'] = df['change_auswirkung']

In [7]:
df.loc[df['change_ausw_num'] == "1|2", 'change_ausw_num'] = 2
df['change_ausw_num'] = pd.to_numeric(df['change_ausw_num'], errors='coerce')
df['change_ausw_num'] = scaler.fit_transform(df[['change_ausw_num']])

In [8]:
df.change_ausw_num.value_counts()#.plot(kind="bar")

change_ausw_num
0.666667    144780
0.333333     42272
0.000000     28849
1.000000      6525
Name: count, dtype: int64

In [9]:
model = get_model()

In [10]:
df['details_preproc3'].fillna('', inplace=True)

In [11]:
combined_text_cols = ['kurztext_svc_klass', 'details_preproc3', 'ci_klass_loc', 'arb_pl_aufgabe_wfaktionen']
df['combined_text'] = df[combined_text_cols].apply(lambda row: ' '.join(val for val in row if pd.notnull(val)), axis=1)

In [12]:
df[combined_text_cols].sample().transpose()

Unnamed: 0,21033
kurztext_svc_klass,wan plus tgw vorabinstallation hardwareabruf r...
details_preproc3,techniker nokia abgestimmt termin anmelden
ci_klass_loc,router
arb_pl_aufgabe_wfaktionen,umsetzungsaufgaben


In [13]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, stratify=df['change_auswirkung'])

In [14]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44486 entries, 77612 to 29544
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   change_nr                  44486 non-null  object 
 1   dauer_arbtlog              44486 non-null  float64
 2   vorlauf_htransf            44486 non-null  float64
 3   mgrsite_bw_bund            44486 non-null  bool   
 4   mgrsite_bwi_it             44486 non-null  bool   
 5   eigenbetrieb               44486 non-null  bool   
 6   arb_pl_aufgabe_wfaktionen  34371 non-null  object 
 7   arb_pl_change_auftrag      44486 non-null  bool   
 8   arb_pl_typ_wartung         44486 non-null  bool   
 9   arb_pl_verwendet           44486 non-null  bool   
 10  ci_prioritaet              44486 non-null  float64
 11  ci_in_betrieb              44486 non-null  bool   
 12  ci_klass_loc               44454 non-null  object 
 13  kurztext_svc_klass         44486 non-null  obje

In [15]:
df_train_numeric = df_train.select_dtypes(include=['number', 'bool'])#.drop(columns= 'change_auswirkung')
df_val_numeric = df_val.select_dtypes(include=['number', 'bool'])#.drop(columns= 'change_auswirkung')

In [16]:
etc_feature_names = df_train_numeric.columns.to_list()

In [17]:
def tokenizer(txt):
    return txt.split()

vectorizer_details = TfidfVectorizer(ngram_range=(1, 3), tokenizer=tokenizer)
vectorizer_details.fit(df_train['combined_text'])
X_train_vec = vectorizer_details.transform(df_train['combined_text'])

X_eval_vec = vectorizer_details.transform(df_val['combined_text'])



In [18]:
X_train_vec2 = np.array(df_train_numeric).astype(float)
X_eval_vec2 = np.array(df_val_numeric).astype(float)

In [19]:
# Concatenate the sparse vectors horizontally
X_train_combined = hstack([X_train_vec, X_train_vec2])
X_eval_combined = hstack([X_eval_vec, X_eval_vec2])


In [20]:
X_train_combined.shape

(177940, 3263380)

In [21]:
y_train = df_train['change_auswirkung']
y_eval = df_val['change_auswirkung']

In [22]:
y_eval.value_counts()

change_auswirkung
4      28956
3       8455
1|2     5770
5       1305
Name: count, dtype: int64

In [23]:
model.fit(X_train_combined, y_train)



In [24]:
prediction = model.predict(X_eval_combined)

In [25]:
report = sklearn.metrics.classification_report(y_eval, prediction, output_dict=True)

In [26]:
print(report['accuracy'])

0.9754979094546599


'accuracy': 0.9139954142876411 (ohne vorlauf)  
'accuracy': 0.9141977251270063 (+ vorlauf_hlog)  
'accuracy': 0.9141078091983995 (+ vorlauf_htransf)  
'accuracy': 0.9754979094546599 (+ change_ausw unscaled)

In [27]:
# Create a DataFrame from the confusion matrix using crosstab
confusion_matrix = pd.crosstab(np.array(y_eval), np.array(prediction),
                               rownames=['Actual'], colnames=['Predicted'])

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix)

Confusion Matrix:
Predicted   1|2     3      4     5
Actual                            
1|2        5652   118      0     0
3            21  8007    427     0
4             0   164  28734    58
5             0     0    302  1003


In [28]:
tensor_logger = TensorLogger(logdir=config.tensor_log_dir)
tensor_logger.log_report(report=report, step=version)
tensor_logger.log_text(version_descr, version)

In [29]:
text_keywords = vectorizer_details.get_feature_names_out().tolist()
feature_names_dict = {'etc': etc_feature_names, 'text_comb': text_keywords}
joblib.dump(feature_names_dict, config.feature_names_path(report['accuracy']), compress=('gzip', 3))

['..\\output/feature_names_v-1_acc-0.975.joblib.gz']

In [30]:
# Save the model to a file
joblib.dump(model, config.model_path(report['accuracy']))
joblib.dump(vectorizer_details, config.vectorizer_path(report['accuracy']))

['..\\output/vectorizer_v-1_acc-0.975.joblib']

In [31]:
df_val.drop(columns=combined_text_cols).to_csv("data_output/val_data.csv", index=False)