# Reading the Translation Memory Data

In [None]:
import os
import re
import time
import math
import random
import unicodedata
import numpy as np
import pandas as pd
from tqdm import tqdm

import string
import spacy

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


In [None]:
# Walk the Kaggle input directory tree and print the full path of every file
# found under it.
for dirname, _, filenames in os.walk('/kaggle/input/21-food-safety-processed'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Corpus root directory. The folder name still says "22 languages": there were
# 22 initially, and two more were added afterwards.
CD = "../input/paralel-translation-corpus-in-22-languages/"

In [None]:
# Source language code. This is a constant and should not be changed:
# the source language is always English.
SL = "EN"

In [None]:
# Target language code. Set it to the desired target language; the available
# abbreviations are listed in the introduction paragraph.
TL = "SV"

In [None]:
# Read the tab-separated, header-less corpus file of the SL-TL pair, keep only
# its first two columns and name them after the source and target language codes.
df = (
    pd.read_csv(f'{CD}{SL}-{TL}/{SL}-{TL}.txt', sep='\t', header=None)[[0, 1]]
    .rename(columns={0: SL, 1: TL})
)

In [None]:
# Preview the first rows of the loaded sentence pairs.
df.head()

In [None]:
# Some cells may be parsed as floats (e.g. missing text is read as NaN), so both
# text columns are forced to str. Rows with a missing side are dropped first:
# str(NaN) is the literal 'nan', which isna()/dropna() can no longer detect and
# which would otherwise survive as a bogus sentence.
# The columns are addressed through SL/TL, the names they were given at load time.
df = df.dropna(subset=[SL, TL])
df[SL] = df[SL].astype(str)
df[TL] = df[TL].astype(str)

In [None]:
# Show the (rows, columns) size of the frame.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Remove every row that has a missing value in any column. A column-wise
# dropna(axis=1) afterwards would be a guaranteed no-op (no NaN can remain once
# all incomplete rows are gone), so only the row-wise call is kept.
df = df.dropna()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Report how many fully duplicated rows exist before de-duplication.
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

# Discard the duplicated rows.
df = df.drop_duplicates()

# Count again to confirm that no duplicates are left.
num_duplicates_after_removal = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {num_duplicates_after_removal}")

In [None]:
# Drop the 'is_food_related' helper column if it is present.
# errors='ignore' keeps this cell from raising KeyError when the column does
# not exist (it has not been created yet, or was already removed).
df = df.drop(columns='is_food_related', errors='ignore')

# Check the result
df.head()

In [None]:
# Write the current frame to a CSV file, leaving out the row index.
df.to_csv(path_or_buf='Normal EN-SV food safety regulations.csv', index=False)

In [None]:
def _clean_text(text):
    """Normalize one sentence for matching and training.

    Removes text in square brackets and HTML tags, lowercases, removes
    punctuation, then collapses runs of spaces and strips the ends.

    Whitespace is tidied last on purpose: removing brackets, tags or
    punctuation can itself leave double or leading/trailing spaces behind.
    """
    text = re.sub(r'\[.*?\]', '', text)   # text in square brackets
    text = re.sub(r'<.*?>', '', text)     # HTML tags
    text = text.lower()                   # normalization
    text = re.sub(r'[^\w\s]', '', text)   # punctuation marks
    return re.sub(' +', ' ', text).strip()


# Clean both sides of the corpus with the same routine. The columns are addressed
# through SL/TL (the names given at load time); the frame has no 'BG' column, so
# hard-coding it raised KeyError.
df[SL] = df[SL].apply(_clean_text)
df[TL] = df[TL].apply(_clean_text)

# No explicit "alignment" truncation is needed: both columns live in the same
# DataFrame and therefore always have the same number of rows.



In [None]:
# Preview the first rows of the frame at this point.
df.head()

In [None]:
# Define keywords for identifying food safety-related contents
food_keywords = [
    "food safety",
    "HACCP",
    "foodborne illness",
    "food contamination",
    "food handling",
    "food hygiene",
    "food processing",
    "food storage",
    "food labeling",
    "food regulations",
    "food inspection",
    "food safety standards",
    "food safety training",
    "food safety practices",
    "food safety guidelines",
    "food safety management",
    "food safety risks",
    "food safety procedures",
    "food safety audits",
    "food safety certifications",
    "food safety monitoring",
    "food safety control",
    "food safety protocols",
    "food safety compliance",
    "food safety regulations",
    "food safety best practices",
    "contamination",
    "food recalls",
    "sanitation",
    "cross-contamination",
    "microbiological hazards",
    "chemical hazards",
    "physical hazards",
    "allergen control",
    "temperature control",
    "good manufacturing practices",
    "quality assurance",
    "traceability",
    "risk assessment",
    "pathogens",
    "food safety auditing",
    "food safety culture",
]


def _keyword_to_regex(keyword):
    """Return a regex fragment matching *keyword* literally.

    Every part is escaped so that regex metacharacters in a keyword cannot
    change the pattern. A hyphen in the keyword is made optional and may also
    be a whitespace character, so "cross-contamination" still matches text
    whose punctuation has been stripped ("crosscontamination") or replaced by
    a space ("cross contamination").
    """
    return r'[-\s]?'.join(re.escape(part) for part in keyword.split('-'))


# Compiled once here instead of on every call: the function is applied to every
# row of the corpus. Re-run this cell after editing food_keywords.
_FOOD_KEYWORD_PATTERN = re.compile(
    r'\b(?:' + '|'.join(_keyword_to_regex(k) for k in food_keywords) + r')\b',
    flags=re.IGNORECASE,
)


def is_food_related(text):
    """Tell whether *text* mentions any of the food-safety keywords.

    Matching is case-insensitive and anchored on word boundaries, so a keyword
    embedded inside a longer word does not count.

    Args:
        text: The value to inspect; anything that is not a str is treated as
            not food related.

    Returns:
        True if at least one keyword occurs in *text*, otherwise False.
    """
    if not isinstance(text, str):
        return False
    return bool(_FOOD_KEYWORD_PATTERN.search(text))

# Flag a row when either its source-language or its target-language text
# contains a keyword. The columns are addressed through SL/TL (the names given
# at load time); the frame has no 'BG' column, so hard-coding it raised KeyError.
df['is_food_related'] = df[SL].apply(is_food_related) | df[TL].apply(is_food_related)

# Keeping only the rows where 'is_food_related' is True
df = df[df['is_food_related']]

# Check the result
df.head()
