In [5]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_md")

def tokenizer(id,fText,l1,l2):
    doc = nlp(fText)
    bdoc = nlp(l1)
    mdoc = nlp(l2)
    ### TOKENIZING ###, ### REMOVING STOP WORDS ### if not token.is_stop
    wList = [token for token in doc]
    bList = [token for token in bdoc]
    mList = [token for token in mdoc]
    ### NORMALIZING ###, ### STEMMING ###, ### LEMMATIZATION ###
    # wList = [token.lemma_ for token in wList]
    ### POS filtering ###
    df = pd.DataFrame(columns=['satzId','Wort','Attribut'], data = {'satzId': id, 'Wort': wList})
    #cast falls nötig von spacy token zu string konvertieren
    df['Wort'] = df['Wort'].astype(str)
    df['Attribut'] = "O"

    #search for all times the first word of the brand occures in the text
    b_brand = df.loc[df['Wort'].str.lower() == str(bList[0]).lower(), 'Attribut']

    #print(b_brand.index.asi8)

    # searching for the Brand Labels

    #if(len(b_brand.index.asi8) == 1):
    #    df.at[b_brand.index.asi8[0], 'Attribut'] = "B-Brand"
    if(len(bList)==1):
        df.loc[df['Wort'].str.lower() == str(bList[0]).lower(), 'Attribut'] = 'B-Brand'
    else:
        for x in range(len(b_brand.index.asi8)):
            #everytime the first Word of the Brand occurs in the Text we need to ckeck the following tokens if it really is the whole Brand Label
            df_to_check = df.iloc[b_brand.index.asi8[x]:b_brand.index.asi8[x]+len(bList)]
            #print(df_to_check)
            # amount of hits have to be the length of the Brand
            hits = 0
            for i, row in enumerate(df_to_check['Wort']):
                if(row.lower() == str(bList[i]).lower()): #hit if the word in the token is the same as in the brand label we search
                    hits += 1
                else:
                    break
            if(hits == len(bList)):
                df.iloc[b_brand.index.asi8[x]:b_brand.index.asi8[x]+len(bList)]['Attribut'] = "I-Brand"
                df.at[b_brand.index.asi8[x], 'Attribut'] = "B-Brand"
                df.at[b_brand.index.asi8[x]+len(bList)-1, 'Attribut'] = "E-Brand"

    #search for all times the first word of the brand occures in the text
    b_modelnumber = df.loc[df['Wort'].str.lower() == str(mList[0]).lower(), 'Attribut']


    if(len(mList) == 1):
        df.loc[df['Wort'].str.lower() == str(mList[0]).lower(), 'Attribut'] = "B-Modelnumber"
    else:
        for x in range(len(b_modelnumber.index.asi8)):
            #everytime the first Word of the Brand occurs in the Text we need to ckeck the following tokens if it really is the whole Brand Label
            df_to_check = df.iloc[b_modelnumber.index.asi8[x]:b_modelnumber.index.asi8[x]+len(mList)]
            #print(df_to_check)
            # amount of hits have to be the length of the Brand
            hits = 0
            for i, row in enumerate(df_to_check['Wort']):
                if(row.lower() == str(mList[i]).lower()): #hit if the word in the token is the same as in the brand label we search
                    hits += 1
                else:
                    break
            if(hits == len(mList)):
                df.iloc[b_modelnumber.index.asi8[x]:b_modelnumber.index.asi8[x]+len(mList)]['Attribut'] = "I-Modelnumber"
                df.at[b_modelnumber.index.asi8[x], 'Attribut'] = "B-Modelnumber"
                df.at[b_modelnumber.index.asi8[x]+len(mList)-1, 'Attribut'] = "E-Modelnumber"


    #for each in bList:
    #    df.loc[df['Wort'] == str(each), 'Attribut'] = "B-Brand"
    return df

from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Wort'].to_list()))
    else:
        vocab = list(set(data['Attribut'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Wort'].to_list())))
    n_tag = len(list(set(data['Attribut'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = tf.keras.preprocessing.sequence.pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = tf.keras.preprocessing.sequence.pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [tf.keras.utils.to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags


In [6]:
ecsv = pd.read_csv('Daten/KoepckeEigen/electronicFixed.csv',escapechar="\\",sep=",",error_bad_lines=False,warn_bad_lines=False)
ecsv.dropna(axis = 0)
dList = []
df = pd.DataFrame()
for index, row in ecsv.sample(1000).iterrows():
    df = df.append(tokenizer(index, str(row['name']) + str(row['productdescription']), str(row['brand']), str(row['modelnumber'])))
    #df = df.append(tokenizer(index, str(row['name']), str(row['brand']), str(row['modelnumber'])))
    print(index)
#print(dList)
#print(dList.pop(0))
#df = pd.DataFrame(dList, columns=['Id', 'satzID', 'Wort', 'Attribut'])
print(df)
df.to_csv('Daten/KoepckeEigen/trainingData.csv', index = False)

3068
22889
15737
11968
47424
10546
31002
40301
6414
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[b_modelnumber.index.asi8[x]:b_modelnumber.index.asi8[x]+len(mList)]['Attribut'] = "I-Modelnumber"
43428
7446
5397
53337
28956
11363
36344
28692
33713
26301
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[b_brand.index.asi8[x]:b_brand.index.asi8[x]+len(bList)]['Attribut'] = "I-Brand"
38658
18051
40710
54841
33098
48532
15630
43389
20793
54932
34194
34083
28909
48589
1569
12509
2338
52774
21709
39608
54982
48912
11396
9526
14537
50739
32802
31228
2