<a href="https://colab.research.google.com/github/vitsiupia/projektPython/blob/main/prosty_model_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Pobierzmy dane i zobaczmy jak wyglądają
!wget 'https://github.com/vitsiupia/projektPython/raw/main/meetings_split.zip'

--2023-05-15 18:52:06--  https://github.com/vitsiupia/projektPython/raw/main/meetings_split.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/vitsiupia/projektPython/main/meetings_split.zip [following]
--2023-05-15 18:52:06--  https://raw.githubusercontent.com/vitsiupia/projektPython/main/meetings_split.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1027422 (1003K) [application/zip]
Saving to: ‘meetings_split.zip.1’


2023-05-15 18:52:06 (18.9 MB/s) - ‘meetings_split.zip.1’ saved [1027422/1027422]



In [25]:
import zipfile
# Unpack the downloaded archive into the current working directory.
# The handle is named `archive` rather than `zip`: a `with ... as zip` target stays
# bound at notebook scope after the block and would shadow the builtin zip().
with zipfile.ZipFile('meetings_split.zip', 'r') as archive:
    archive.extractall()

## Wizualizacja danych

Zobaczmy, z czym pracujemy.

In [26]:
import os
import random
import textwrap

def visualize_data(folder):
    """Print one randomly chosen transcript together with its summary.

    Files are paired by name: ``<id>.transcript.txt`` belongs with
    ``<id>.abssumm.txt``. Only ids for which both files are present in
    ``folder`` are candidates, so a transcript without a summary (or the
    reverse) can never be selected and then fail to open.

    Args:
        folder: Directory holding the transcript and summary text files.

    Returns:
        None. The texts are written to stdout wrapped to 80 columns. A short
        message is printed instead when ``folder`` is not a directory or
        contains no complete transcript/summary pair.
    """
    transcript_suffix = '.transcript.txt'
    summary_suffix = '.abssumm.txt'

    # isdir (not exists) so that a regular file path is rejected here instead of
    # raising from os.listdir below.
    if not os.path.isdir(folder):
        print(f"Folder '{folder}' nie istnieje.")
        return

    filenames = os.listdir(folder)
    transcript_ids = {
        name[:-len(transcript_suffix)]
        for name in filenames
        if name.endswith(transcript_suffix)
    }
    summary_ids = {
        name[:-len(summary_suffix)]
        for name in filenames
        if name.endswith(summary_suffix)
    }

    # Sorted so the choice is reproducible under random.seed(); os.listdir order
    # is arbitrary.
    paired_ids = sorted(transcript_ids & summary_ids)
    if not paired_ids:
        print("Brak zarówno transkryptów jak i podsumowań w folderze.")
        return

    meeting_id = random.choice(paired_ids)

    with open(os.path.join(folder, meeting_id + transcript_suffix), 'r', encoding='utf-8') as file:
        transcript_text = file.read()

    with open(os.path.join(folder, meeting_id + summary_suffix), 'r', encoding='utf-8') as file:
        summary_text = file.read()

    # Transcript
    print(f"Transkrypt {meeting_id}:")
    for line in textwrap.wrap(transcript_text, width=80):
        print(line)

    print()

    # Summary
    print("Podsumowanie:")
    for line in textwrap.wrap(summary_text, width=80):
        print(line)

In [27]:
# Print one randomly chosen transcript and its summary from the train folder.
visualize_data('meetings_split/train')

Transkrypt TS3003d:
well richard came default spot onoff button mute button theres volume channel
selectors simple plusminus button thought help button hold press another button
help goes lcd screen theres zero one zero buttons button teletext button
subtitles company logo rather simple prototype well see testing users take sure
pop time think look controls theyve got buttons well onoff button necessity cant
drop one volume channel buttons need obviously need mute button could replaced
pressing volumedown button twice could cancel one think help button really
necessary theres way know someone wants know button well cant leave number
buttons guess rather basic already theyre rather hard draw whiteboard theyre
supposed equal sized round little logo volume triangle stuff recognition
materials weve chosen rubber buttons different colour case jump thats yes yes
thats thats big cost leave display also save money chip isnt docking station
isnt even c schematic even taken price could save mone

## Tworzenie prostego modelu NLP

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model

def create_text_to_text_model(input_vocab_size, output_vocab_size, input_max_len, output_max_len, hidden_units):
    """Assemble an LSTM encoder-decoder that maps token ids to token distributions.

    The encoder embeds the source ids and runs one LSTM whose final hidden and
    cell states initialise the decoder LSTM. The decoder embeds the target ids,
    emits one vector per position, and a softmax Dense layer turns each vector
    into a distribution over ``output_vocab_size`` entries. ``hidden_units`` is
    used both as the embedding width and as the LSTM size. Both embeddings are
    built with ``mask_zero=True``.

    Returns:
        A Keras ``Model`` taking ``[encoder_input, decoder_input]`` with shapes
        ``(batch, input_max_len)`` and ``(batch, output_max_len)``.
    """
    # Encoder: only the final states are kept, the per-step output is discarded.
    source_tokens = Input(shape=(input_max_len,))
    source_vectors = Embedding(input_vocab_size, hidden_units, mask_zero=True)(source_tokens)
    encoder_results = LSTM(hidden_units, return_state=True)(source_vectors)
    final_states = list(encoder_results[1:])

    # Decoder: starts from the encoder states and keeps the whole output sequence.
    target_tokens = Input(shape=(output_max_len,))
    target_vectors = Embedding(output_vocab_size, hidden_units, mask_zero=True)(target_tokens)
    decoder_results = LSTM(hidden_units, return_sequences=True, return_state=True)(
        target_vectors, initial_state=final_states
    )
    token_probabilities = Dense(output_vocab_size, activation='softmax')(decoder_results[0])

    return Model([source_tokens, target_tokens], token_probabilities)

# Przykładowe użycie
# Example usage
# NOTE(review): the data fed to this model has to agree with these sizes
# (token ids below the vocabulary sizes, sequences of exactly these lengths).
input_vocab_size = 10000   # rows of the encoder embedding table
output_vocab_size = 8000   # rows of the decoder embedding table and width of the softmax
input_max_len = 100        # length of the encoder input sequence
output_max_len = 80        # length of the decoder input sequence
hidden_units = 256         # used as both embedding width and LSTM size

model = create_text_to_text_model(input_vocab_size, output_vocab_size, input_max_len, output_max_len, hidden_units)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 80)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 256)     2560000     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 80, 256)      2048000     ['input_4[0][0]']                
                                                                                            

## Przygotowanie danych wejściowych dla modelu

In [12]:
# Ścieżki do folderu z danymi treningowymi i walidacyjnymi
# Paths to the folders with the training and validation data
train_data_dir = "meetings_split/train"
val_data_dir = "meetings_split/val"

In [20]:
import glob
import os

def load_data(transcript_files):
    """Read transcripts and their summaries as two aligned lists of strings.

    For each path, the summary is looked up in the same directory under
    ``<id>.abssumm.txt``, where ``<id>`` is the transcript file name without its
    ``.transcript.txt`` suffix. (Stripping only the last extension would leave
    ``<id>.transcript`` and look for ``<id>.transcript.abssumm.txt``, which never
    matches the dataset's naming.) Paths without the transcript suffix fall back
    to dropping just the last extension.

    Args:
        transcript_files: Iterable of transcript file paths.

    Returns:
        ``(transcripts, summaries)``: two lists of equal length with the
        whitespace-stripped file contents. Transcripts whose summary file does
        not exist are skipped.
    """
    transcript_suffix = ".transcript.txt"
    summary_suffix = ".abssumm.txt"

    transcripts = []
    summaries = []

    for t_file in transcript_files:
        directory, basename = os.path.split(t_file)
        if basename.endswith(transcript_suffix):
            meeting_id = basename[:-len(transcript_suffix)]
        else:
            meeting_id = os.path.splitext(basename)[0]
        s_file = os.path.join(directory, meeting_id + summary_suffix)

        # Keep the two lists aligned: only load a transcript when its summary exists.
        if not os.path.isfile(s_file):
            continue

        with open(t_file, 'r', encoding='utf-8') as f:
            transcripts.append(f.read().strip())

        with open(s_file, 'r', encoding='utf-8') as f:
            summaries.append(f.read().strip())

    return transcripts, summaries

# Collect the transcript paths of each split from the configured data folders.
# Matching "*.transcript.txt" directly replaces the earlier "all .txt minus
# .abssumm" filter; sorting makes the sample order reproducible because glob
# returns files in arbitrary order. Summary paths are not built here:
# load_data derives them from the transcript paths.
train_transcript_files = sorted(glob.glob(os.path.join(train_data_dir, "*.transcript.txt")))
val_transcript_files = sorted(glob.glob(os.path.join(val_data_dir, "*.transcript.txt")))

# Aligned lists of texts: transcripts[i] is summarised by summaries[i].
train_transcripts, train_summaries = load_data(train_transcript_files)
val_transcripts, val_summaries = load_data(val_transcript_files)

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

# One tokenizer is shared by transcripts and summaries, so its ids feed both the
# encoder embedding (input_vocab_size rows) and the decoder embedding / softmax
# (output_vocab_size entries). Capping num_words at the smaller of the two keeps
# every produced id inside both ranges; words beyond the cap, and words unseen
# during fitting, map to the OOV token instead of being dropped.
tokenizer = Tokenizer(num_words=min(input_vocab_size, output_vocab_size), oov_token="<unk>")
tokenizer.fit_on_texts(train_transcripts + train_summaries)

In [22]:
# Convert every corpus into lists of integer token ids using the fitted tokenizer.
(
    train_transcripts_encoded,
    train_summaries_encoded,
    val_transcripts_encoded,
    val_summaries_encoded,
) = [
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_transcripts, train_summaries, val_transcripts, val_summaries)
]


In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Kept under its original name; tied to the encoder length instead of a separate
# literal so it cannot drift from the model definition.
max_sequence_length = input_max_len

# Transcripts feed the encoder (input_max_len steps) and summaries feed the
# decoder (output_max_len steps), so each side is padded to the length its model
# input was built with. truncating='post' matches padding='post': over-long
# texts keep their beginning rather than their tail.
train_transcripts_padded = pad_sequences(train_transcripts_encoded, maxlen=input_max_len, padding='post', truncating='post')
train_summaries_padded = pad_sequences(train_summaries_encoded, maxlen=output_max_len, padding='post', truncating='post')

val_transcripts_padded = pad_sequences(val_transcripts_encoded, maxlen=input_max_len, padding='post', truncating='post')
val_summaries_padded = pad_sequences(val_summaries_encoded, maxlen=output_max_len, padding='post', truncating='post')

## Kompilacja i trenowanie modelu

In [None]:
from tensorflow import keras


def _seq2seq_arrays(transcripts_padded, summaries_padded):
    """Build ``([encoder_input, decoder_input], target)`` for teacher forcing.

    Both inputs are fitted to the lengths the model was built with
    (``input_max_len`` / ``output_max_len``). The target is the decoder input
    shifted one step to the left, so at step t the model sees summary token t
    and is trained to predict token t + 1; the freed last step is padding (0).

    Raises:
        ValueError: if a token id does not fit the model's vocabulary sizes.
    """
    encoder_input = pad_sequences(transcripts_padded, maxlen=input_max_len, padding='post', truncating='post')
    decoder_input = pad_sequences(summaries_padded, maxlen=output_max_len, padding='post', truncating='post')
    # Drop the first token of every row, then right-pad with 0 back to full width.
    target = pad_sequences(decoder_input[:, 1:], maxlen=output_max_len, padding='post')

    if len(target) and (encoder_input.max() >= input_vocab_size or decoder_input.max() >= output_vocab_size):
        raise ValueError(
            "Token ids exceed the model vocabulary "
            f"(input_vocab_size={input_vocab_size}, output_vocab_size={output_vocab_size}); "
            "limit the tokenizer vocabulary or rebuild the model with larger sizes."
        )
    return [encoder_input, decoder_input], target


train_inputs, train_outputs = _seq2seq_arrays(train_transcripts_padded, train_summaries_padded)
val_inputs, val_outputs = _seq2seq_arrays(val_transcripts_padded, val_summaries_padded)

if len(train_outputs) == 0:
    raise ValueError("No training pairs were loaded; check the data loading cells.")

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(train_inputs, train_outputs, validation_data=(val_inputs, val_outputs), batch_size=32, epochs=10)

# Evaluate the model. This notebook loads only the train and val splits, so the
# validation split is used here; swap in a held-out test split once one is loaded.
val_loss, val_accuracy = model.evaluate(val_inputs, val_outputs)
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)