<a href="https://colab.research.google.com/github/vitsiupia/projektPython/blob/main/prosty_model_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Pobierzmy dane i zobaczmy jak wyglądają
!wget 'https://github.com/vitsiupia/projektPython/raw/main/meetings_split.zip'

--2023-05-16 19:57:47--  https://github.com/vitsiupia/projektPython/raw/main/meetings_split.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/vitsiupia/projektPython/main/meetings_split.zip [following]
--2023-05-16 19:57:47--  https://raw.githubusercontent.com/vitsiupia/projektPython/main/meetings_split.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1027422 (1003K) [application/zip]
Saving to: ‘meetings_split.zip.1’


2023-05-16 19:57:47 (55.2 MB/s) - ‘meetings_split.zip.1’ saved [1027422/1027422]



In [12]:
import zipfile
with zipfile.ZipFile('meetings_split.zip', 'r') as zip:
  zip.extractall()

## Wizualizacja danych

Zobaczmy z czym pracujemy

In [13]:
import os
import random
import textwrap

def visualize_data(folder):
    # Sprawdzenie czy folder istnieje
    if not os.path.exists(folder):
        print(f"Folder '{folder}' nie istnieje.")
        return

    transcripts = []
    summaries = []

    # Przejście przez pliki w folderze
    for filename in os.listdir(folder):
        if filename.endswith('.transcript.txt'):
            transcripts.append(filename)
        elif filename.endswith('.abssumm.txt'):
            summaries.append(filename)

    # Sprawdzenie czy istnieją zarówno transkrypty jak i podsumowania
    if not transcripts or not summaries:
        print("Brak zarówno transkryptów jak i podsumowań w folderze.")
        return

    # Losowe wybranie pliku
    random_file = random.choice(transcripts + summaries)

    # Wybieranie transkryptu i podsumowania na podstawie losowego pliku
    if random_file.endswith('.transcript.txt'):
        transcript_file = random_file
        summary_file = random_file.replace('.transcript.txt', '.abssumm.txt')
    else:
        summary_file = random_file
        transcript_file = random_file.replace('.abssumm.txt', '.transcript.txt')

    # Odczytanie treści transkryptu
    with open(os.path.join(folder, transcript_file), 'r', encoding='utf-8') as file:
        transcript_text = file.read()

    # Odczytanie treści podsumowania
    with open(os.path.join(folder, summary_file), 'r', encoding='utf-8') as file:
        summary_text = file.read()

    # Wyświetlanie transkryptu
    print(f"Transkrypt {transcript_file.split('.')[0]}:")
    lines = textwrap.wrap(transcript_text, width=80)
    for line in lines:
        print(line)

    print()

    # Wyświetlanie podsumowania
    print("Podsumowanie:")
    lines = textwrap.wrap(summary_text, width=80)
    for line in lines:
        print(line)

In [14]:
visualize_data('meetings_split/train')

Transkrypt TS3011b:
time cup coffee get cup coffee well work time either think yes excuse still
right thing well changes design requirements changes method also basically
device send messages tv set easy way sending pulses infrared light tv set well
th tried implement picture hardly readable see well theres energy source
basically connected three things user interface connected chip connected sender
generates messages using infrared light sent receiver thats basically idea
theres little picture imagination device look look found usually kind things
consist battery infrared diode buttons chips circuit board thats cased together
nothing almost every piece equipment every piece every tv set controlled
infrared exceptions infrared controls luxury remote controls lithium buttons
think thats w yes little little bit fancy also maybe consider basic scheme
things implemented basically theres one chip buttons connected buttons lit whole
thing transmitted infrared li diode power source thats basi

## Tworzenie prostego modelu NLP

In [38]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model

def create_text_to_text_model(input_vocab_size, output_vocab_size, input_max_len, output_max_len, hidden_units):
    # Warstwa kodująca (Encoder)
    encoder_input = Input(shape=(input_max_len,))
    encoder_embedding = Embedding(input_vocab_size, hidden_units, mask_zero=True)(encoder_input)
    encoder_lstm = LSTM(hidden_units, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_state = [state_h, state_c]

    # Warstwa dekodująca (Decoder)
    decoder_input = Input(shape=(output_max_len,))
    decoder_embedding = Embedding(output_vocab_size, hidden_units, mask_zero=True)(decoder_input)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_state)
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_output = decoder_dense(decoder_output)

    # Tworzenie modelu
    model = Model([encoder_input, decoder_input], decoder_output)
    return model

# Przykładowe użycie
input_vocab_size = 3000
output_vocab_size = 2000
input_max_len = 3900
output_max_len = 220
hidden_units = 256

model = create_text_to_text_model(input_vocab_size, output_vocab_size, input_max_len, output_max_len, hidden_units)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 3900)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 3900, 256)    768000      ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 220, 256)     512000      ['input_6[0][0]']                
                                                                                            

## Przygotowanie danych do ich wejścia do modelu

In [16]:
# Ścieżki do folderu z danymi treningowymi i walidacyjnymi
train_data_dir = "meetings_split/train"
val_data_dir = "meetings_split/val"

In [17]:
import glob
import os

train_transcripts_list = glob.glob("meetings_split/train/*.txt")
train_transcripts_list = [t for t in train_transcripts_list if ".abssumm" not in t]
train_summaries_list = [t.replace('.transcript', '.abssumm') for t in train_transcripts_list]

val_transcripts_list = glob.glob("meetings_split/val/*.txt")
val_transcripts_list = [t for t in val_transcripts_list if ".abssumm" not in t]
val_summaries_list = [t.replace('.transcript', '.abssumm') for t in val_transcripts_list]

# Wyświetlenie ścieżek do plików, aby sprawdzić poprawność
print(train_transcripts_list)
print(train_summaries_list)
print(val_transcripts_list)
print(val_summaries_list)

['meetings_split/train/ES2012a.transcript.txt', 'meetings_split/train/ES2007d.transcript.txt', 'meetings_split/train/ES2016d.transcript.txt', 'meetings_split/train/TS3008d.transcript.txt', 'meetings_split/train/TS3004a.transcript.txt', 'meetings_split/train/TS3012d.transcript.txt', 'meetings_split/train/TS3003c.transcript.txt', 'meetings_split/train/ES2016b.transcript.txt', 'meetings_split/train/IS1002c.transcript.txt', 'meetings_split/train/TS3007a.transcript.txt', 'meetings_split/train/IS1006d.transcript.txt', 'meetings_split/train/IS1000a.transcript.txt', 'meetings_split/train/TS3011b.transcript.txt', 'meetings_split/train/TS3011c.transcript.txt', 'meetings_split/train/TS3008b.transcript.txt', 'meetings_split/train/TS3010a.transcript.txt', 'meetings_split/train/TS3005c.transcript.txt', 'meetings_split/train/ES2012d.transcript.txt', 'meetings_split/train/ES2011d.transcript.txt', 'meetings_split/train/ES2006b.transcript.txt', 'meetings_split/train/IS1005a.transcript.txt', 'meetings_sp

In [27]:
import numpy as np

# Funkcja do odczytywania zawartości plików tekstowych
def read_text_files(file_paths):
    texts = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()
            texts.append(text)
    return texts

# Odczytanie zawartości plików train_transcripts i train_summaries
train_transcripts_texts = read_text_files(train_transcripts_list)
train_summaries_texts = read_text_files(train_summaries_list)

# Odczytanie zawartości plików val_transcripts i val_summaries
val_transcripts_texts = read_text_files(val_transcripts_list)
val_summaries_texts = read_text_files(val_summaries_list)

# Tworzenie tensorów dla train_transcripts i train_summaries
train_transcripts_tensor = np.array([train_transcripts_texts, train_summaries_texts]).T

# Tworzenie tensorów dla val_transcripts i val_summaries
val_transcripts_tensor = np.array([val_transcripts_texts, val_summaries_texts]).T


In [None]:
train_transcripts_tensor[0]

## Kompilacja i trenowanie modelu

In [34]:
# Jak duży zrobić padding?
def find_longest_sequence(transcripts, summaries):
    max_transcript_length = 0
    max_summary_length = 0

    for i in range(len(transcripts)):
        with open(transcripts[i], 'r', encoding='utf-8') as t_file:
            transcript = t_file.read().strip()
        with open(summaries[i], 'r', encoding='utf-8') as s_file:
            summary = s_file.read().strip()

        transcript_length = len(transcript.split())
        summary_length = len(summary.split())

        if transcript_length > max_transcript_length:
            max_transcript_length = transcript_length

        if summary_length > max_summary_length:
            max_summary_length = summary_length

    return max_transcript_length, max_summary_length

# Wywołanie funkcji dla Twoich danych
max_transcript_length, max_summary_length = find_longest_sequence(train_transcripts_list, train_summaries_list)

print("Najdłuższy transkrypt ma", max_transcript_length, "słów.")
print("Najdłuższe podsumowanie ma", max_summary_length, "słów.")

Najdłuższy transkrypt ma 3870 słów.
Najdłuższe podsumowanie ma 201 słów.


In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inicjalizacja tokenizera
tokenizer = Tokenizer()

# Dopasowanie tokenizera do tekstów transkryptów i podsumowań
texts = np.concatenate([train_transcripts_tensor[:, 0], train_transcripts_tensor[:, 1]])
tokenizer.fit_on_texts(texts)

# Przekształcenie tekstów na sekwencje indeksów tokenów
train_transcripts_sequences = tokenizer.texts_to_sequences(train_transcripts_tensor[:, 0])
train_summaries_sequences = tokenizer.texts_to_sequences(train_transcripts_tensor[:, 1])
val_transcripts_sequences = tokenizer.texts_to_sequences(val_transcripts_tensor[:, 0])
val_summaries_sequences = tokenizer.texts_to_sequences(val_transcripts_tensor[:, 1])

# Zastosowanie paddingu do sekwencji
max_transcript_length = 3900  # Dostosuj wartość do oczekiwanej maksymalnej długości sekwencji
max_summary_length = 220
train_transcripts_padded = pad_sequences(train_transcripts_sequences, maxlen=max_transcript_length, padding='post')
train_summaries_padded = pad_sequences(train_summaries_sequences, maxlen=max_summary_length, padding='post')
val_transcripts_padded = pad_sequences(val_transcripts_sequences, maxlen=max_transcript_length, padding='post')
val_summaries_padded = pad_sequences(val_summaries_sequences, maxlen=max_summary_length, padding='post')

In [36]:
train_labels = []
val_labels = []

train_labels = train_summaries_padded[:, 0:]
val_labels = val_summaries_padded[:, 0:]

In [39]:
# Kompilacja modelu
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Przetrenowanie modelu
history = model.fit(
    [train_transcripts_padded, train_summaries_padded],
    train_labels,
    validation_data=([val_transcripts_padded, val_summaries_padded], val_labels),
    batch_size=32,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
