In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.svm import SVC

In [None]:
def load_data(folder_path, label, encoding='utf-8'):
    """Read every ``.txt`` file in a folder into a list of labelled records.

    Args:
        folder_path: Directory to scan (not recursive).
        label: Value stored under the ``'label'`` key of every record.
        encoding: Text encoding used to decode the files. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of ``{'text': <file contents>, 'label': label}`` dicts, one per
        ``.txt`` file, ordered by file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and any seeded split computed from it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('.txt'):
            with open(os.path.join(folder_path, file_name), 'r', encoding=encoding) as file:
                data.append({'text': file.read(), 'label': label})
    return data

# Read both transcript folders: 'transcripts_2.0' gets label 1, 'transcripts_bad' gets label 0.
good_transcripts = load_data('transcripts_2.0', label=1)
bad_transcripts = load_data('transcripts_bad', label=0)

# One frame with 'text' and 'label' columns; label-1 rows come first.
data = pd.DataFrame([*good_transcripts, *bad_transcripts])
texts, labels = data['text'], data['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear SVM baseline on the TF-IDF features.
# Alternative classifier kept for experimentation: model = RandomForestClassifier()
model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

class TranscriptDataset(Dataset):
    """Torch dataset that tokenizes one transcript per item.

    Args:
        texts: Object supporting ``len()`` and positional ``.iloc[idx]`` access
            (e.g. a pandas Series) holding the raw texts.
        labels: Object supporting ``.iloc[idx]`` holding one integer label per text.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label at position ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch dimension removed) and ``'label'``
            (a ``torch.long`` scalar tensor).
        """
        # .iloc is positional, so the shuffled index left by train_test_split is irrelevant.
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(int(label), dtype=torch.long),
        }

# Tokenizer and encoder are both loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
pretrained_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Wrap each split so that items are tokenized on access, padded/truncated to 128 tokens.
train_dataset = TranscriptDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_length=128)
test_dataset = TranscriptDataset(texts=X_test, labels=y_test, tokenizer=tokenizer, max_length=128)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

def extract_embeddings(data_loader, model):
    """Compute one mean-pooled embedding per example in ``data_loader``.

    The model is put in eval mode and run without gradient tracking. Token
    embeddings from ``last_hidden_state`` are averaged over the sequence axis
    using the attention mask as weights, so padded positions do not dilute the
    result.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of matching shape.
        model: ``torch.nn.Module`` called as ``model(input_ids=...,
            attention_mask=...)`` whose output exposes ``last_hidden_state``.

    Returns:
        A 2-D NumPy array with one row per example, in the order the loader
        yields them.

    Raises:
        ValueError: If ``data_loader`` yields no batches (raised by ``np.vstack``).
    """
    model.eval()
    # Run on whatever device the model already lives on; default to CPU for
    # parameter-less modules.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden = outputs.last_hidden_state
            # Weight each token by its mask value so padding contributes nothing.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden * mask).sum(dim=1) / token_counts
            # .cpu() is required before .numpy() when the model runs on an accelerator.
            embeddings.append(pooled.cpu().numpy())
    return np.vstack(embeddings)

# extract_embeddings returns rows in the order the loader yields them, and
# train_loader shuffles. Extracting from it would pair each embedding with the
# wrong entry of y_train, so use an unshuffled loader over the same dataset.
train_eval_loader = DataLoader(train_dataset, batch_size=train_loader.batch_size, shuffle=False)

X_train_embeddings = extract_embeddings(train_eval_loader, pretrained_model)
X_test_embeddings = extract_embeddings(test_loader, pretrained_model)

# Linear SVM on the frozen DistilBERT sentence embeddings.
svm_model = SVC(kernel='linear')
#svm_model = RandomForestClassifier()
svm_model.fit(X_train_embeddings, y_train)

y_pred_svm = svm_model.predict(X_test_embeddings)
print(classification_report(y_test, y_pred_svm))

In [None]:
from gensim.models import KeyedVectors

# Download "cc.en.300.vec.gz" from FastText site
fasttext_vectors = KeyedVectors.load_word2vec_format(
    'cc.en.300.vec.gz',
    binary=False,
)

def get_fasttext_embeddings(texts, model):
    """Turn each text into the average of its known word vectors.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        model: Word-vector lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        NumPy array with one row per text. A text with no word present in
        ``model`` is represented by a zero vector of length ``model.vector_size``.
    """
    def _embed(document):
        # Keep only the tokens the model has a vector for.
        known = [model[token] for token in document.split() if token in model]
        if not known:
            return np.zeros(model.vector_size)
        return np.mean(known, axis=0)

    return np.array([_embed(document) for document in texts])

# Averaged FastText word vectors for each split.
X_train_fasttext = get_fasttext_embeddings(X_train, fasttext_vectors)
X_test_fasttext = get_fasttext_embeddings(X_test, fasttext_vectors)

# Linear SVM on the FastText features.
# Alternative classifier kept for experimentation: svm_model_fasttext = RandomForestClassifier()
svm_model_fasttext = SVC(kernel='linear').fit(X_train_fasttext, y_train)
y_pred_fasttext = svm_model_fasttext.predict(X_test_fasttext)

# Header and report on separate lines, exactly as two consecutive print calls would emit them.
print(
    "FastText SVM Classification Report:",
    classification_report(y_test, y_pred_fasttext),
    sep="\n",
)
