In [1]:
!pip install tensorflow
!pip install swifter
import swifter
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
import time
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pickle



In [2]:
!pip show tensorflow

Name: tensorflow
Version: 2.16.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\PCF\AppData\Roaming\Python\Python311\site-packages
Requires: tensorflow-intel
Required-by: 


In [3]:
max_len = 256
batch_size = 64
num_epochs = 3
max_words = 10000
num_labels = 2

In [4]:
# def pretrain_bert(all_cloud, tokenizer, max_len, batch_size, num_epochs):
#     model = BertForMaskedLM.from_pretrained('bert-base-uncased')
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)

#     dataset = MyDataset(all_cloud, tokenizer, max_len)
#     train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    
#     total_batches = len(train_loader)
    
#     for epoch in range(num_epochs):
#         t = time.time()
#         model.train()
#         total_loss = 0.0
#         for batch_idx, batch in enumerate(train_loader):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)
            
#             optimizer.zero_grad()
#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
            
#             if (batch_idx + 1) % 100 == 0:
#                 batches_done = (epoch * total_batches) + (batch_idx + 1)
#                 batches_left = num_epochs * total_batches - batches_done
#                 print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{total_batches}], Loss: {loss.item()}, Time taken: {time.time() - t}')
#                 print(f'{batches_done} batches done, {batches_left} batches left')

#         total_loss /= total_batches
#         print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}, Time taken: {time.time() - t}')

#     return model


### Dataset Preparation

In [5]:
# helper functions
def merge_dfs(df1, df2):
    merged = pd.concat([df1, df2], ignore_index=True)
    return merged

# find all SP keywords
def get_sp_keywords(df):
    df['keywords_found'] = df['keywords_found'].fillna('')
    all_keywords = df['keywords_found'].str.split().explode()
    unique_keywords = all_keywords.unique()
    return unique_keywords

# all rows from df where keyword is in title or tag
def make_SP_train_dataset(df, SP_keywords):
    df['processed_title'] = df['processed_title'].fillna('').astype(str)
    SP_keywords_set = set(SP_keywords)

    def row_contains_keyword(row):
        title_words = row['processed_title'].split()
        tag_words = row['processed_tags'].split()
        title_contains = any(keyword in title_words for keyword in SP_keywords_set)
        tags_contains = any(keyword in tag_words for keyword in SP_keywords_set)
        return title_contains or tags_contains
    
    filtered_df = df[df.apply(row_contains_keyword, axis=1)]
    return filtered_df

# equal amount of posts from all cloud not in our relevant df
def make_non_SP_train_dataset(big_df, SP_df):
    non_SP_ids = set(big_df['id']) - set(SP_df['id'])
    non_SP_df = big_df[big_df['id'].isin(non_SP_ids)]
    non_SP_df = non_SP_df.sample(n=93178, random_state=42)  
    return non_SP_df


def add_labels(sp, nonsp):
    sp = sp.assign(label='SP')
    nonsp = nonsp.assign(label='nonSP')
    return sp, nonsp

In [6]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# Ensure that necessary NLTK datasets are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize Snowball stemmer
stemmer = SnowballStemmer('english')

# Regular expression patterns (same as before)
code_snippet_pattern = re.compile(r'<code>.*?</code>')  # Detects code snippets
url_pattern = re.compile(r'https?://\S+|www\.\S+')  # Detects URLs
number_pattern = re.compile(r'\b\d+\b')  # Detects standalone numbers
extended_single_char_pattern = re.compile(r'\b\w\b|\w(?=\d)|\d(?=\w)')  # Detects single characters and numbers attached to words

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    
    # Remove code snippets and HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    code_elements = soup.find_all(['code', 'pre', 'a', 'img'])

    # Remove code elements from the HTML
    for code_element in code_elements:
        code_element.decompose()

    # Get the cleaned HTML without code snippets
    text = soup.prettify()

    text = BeautifulSoup(text, 'lxml').get_text()

    # Convert text to lowercase
    text = text.lower()

    # Remove URLs, numbers, and single characters
    text = url_pattern.sub(' ', text)
    text = number_pattern.sub(' ', text)
    text = extended_single_char_pattern.sub(' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and non-alphabetic characters, apply stemming
    stop_words = set(stopwords.words('english'))
    tokens = [stemmer.stem(word) if len(word) > 3 else word for word in tokens if word.isalpha() and word not in stop_words and word != '' and len(word) > 1]

    return ' '.join(tokens)

def preprocess_tags(tags_text):
    if pd.isnull(tags_text):  
        return ""
    tags_text = tags_text.replace("|", " ")
    return preprocess_text(tags_text)

def preprocess(df):
    
    for col in ['processed_title', 'processed_body', 'processed_tags']:
        #  for col in ['processed_title', 'processed_body', 'processed_tags']:
        df[col] = df[col].fillna('').astype(str)
        new_col = f'processed_{col}'
        df[new_col] = df[col].swifter.apply(preprocess_text)
    
    # Processing tags with the new function
    # df['processed_tags'] = df['tags'].swifter.apply(preprocess_tags)
    df['sumair_text'] = df['processed_title'] + ' ' + df['processed_body'] + ' ' + df['processed_tags']
    df['sumair_text'] = df['sumair_text'].fillna('').astype(str)
    df['sumair_text'] = df['sumair_text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
    return df



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PCF\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PCF\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PCF\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Model training

In [7]:
def make_sequences_and_labels(df):
    texts = (df['sumair_text']).tolist()
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    label_to_int = {"SP": 1, "nonSP": 0}
    df['label'] = df['label'].map(label_to_int)
    return padded_sequences, tokenizer

def training(padded_sequences, df, max_words, max_len, tokenizer):
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label'], test_size=0.1, random_state=42)
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(32)),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

    model.save('C:\\Users\\PCF\\Downloads\\SO model\\model.h5')
    with open('C:\\Users\\PCF\\Downloads\\SO model\\tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle)


In [8]:
# # prepare data for model

# def load_model_and_tokenizer(num_labels):
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = num_labels)
#     return model, tokenizer

# def make_tokenized_dataset(df, tokenizer):
#     df['processed_title'] = df['processed_title'].fillna('').astype(str)
#     texts = (df['processed_title'] + df['processed_body'] + df['processed_tags']).tolist()
#     tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=256, truncation=True) for text in texts]
#     padded_texts = [tokenized_text + [0] * (max_len - len(tokenized_text)) for tokenized_text in tokenized_texts]
#     return padded_texts

# def make_tensordataset(padded_texts, df):
#     input_ids = torch.tensor(padded_texts)

#     attention_masks = [[1 if token != 0 else 0 for token in text] for text in padded_texts]
#     attention_masks = torch.tensor(attention_masks)

#     labels = df['label'].tolist()
#     unique_labels = list(set(labels)) 
#     label_map = {label: idx for idx, label in enumerate(unique_labels)}  
#     label_ids = [label_map[label] for label in labels]
#     labels_tensor = torch.tensor(label_ids)

#     dataset = TensorDataset(input_ids, attention_masks, labels_tensor)

#     return dataset
        
# def make_dataloaders(dataset):
#     train_size = 0.8
#     train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=42)
#     train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9, random_state=42)
#     train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
#     test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
#     val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)
#     return train_dataloader, val_dataloader, test_dataloader
        


In [9]:
# # fine tune model
# def fine_tuning(model, train_dataloader, val_dataloader):
#     train_losses = []
#     val_losses = []

#     optimizer = AdamW(model.parameters(), lr=1e-5)

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)

#     num_epochs = 3
#     for epoch in range(num_epochs):
#         t = time.time()
#         model.train()
#         total_train_loss = 0.0
#         for idx, batch in enumerate(train_dataloader):
#             input_ids, attention_mask, lbls = batch
#             input_ids, attention_mask, lbls = (
#                 input_ids.to(device),
#                 attention_mask.to(device),
#                 lbls.to(device)
#             )

#             optimizer.zero_grad()
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lbls)
#             loss = outputs.loss
#             loss.backward()
#             optimizer.step()

#             total_train_loss += loss.item()

#             if idx % 100 == 0:
#                 print('training batch ', idx, ', time taken', time.time() - t)

#         avg_train_loss = total_train_loss / len(train_dataloader)
#         train_losses.append(avg_train_loss)

#         # validation
#         model.eval()
#         total_val_loss = 0.0
#         for idx, val_batch in enumerate(val_dataloader):
#             val_input_ids, val_attention_mask, val_labels = val_batch
#             val_input_ids, val_attention_mask, val_labels = (
#                 val_input_ids.to(device),
#                 val_attention_mask.to(device),
#                 val_labels.to(device)
#             )

#             with torch.no_grad():
#                 outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

#             val_loss = outputs.loss
#             total_val_loss += val_loss.item()

#             if idx % 100 == 0:
#                 print('val batch ', idx, ', time taken', time.time() - t)

#         avg_val_loss = total_val_loss / len(val_dataloader)
#         val_losses.append(avg_val_loss)

#         print(f"Epoch {epoch + 1}: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

#     return model



### self testing

In [10]:
# load data
pre2018 = pd.read_csv("C:\\Users\\PCF\\Downloads\\SO model\\pre2018_processed.csv")
post2018 = pd.read_csv("C:\\Users\\PCF\\Downloads\\SO model\\post2018_processed.csv")
all_cloud = merge_dfs(pre2018, post2018)

relevant_data = pd.read_csv('C:\\Users\\PCF\\Desktop\\sproj stuff\\datasets\\50_annotated_data.csv')
relevant_ids = set(relevant_data['id'])
relevant_df = all_cloud[all_cloud['id'].isin(relevant_ids)] 

  relevant_data = pd.read_csv('C:\\Users\\PCF\\Desktop\\sproj stuff\\datasets\\50_annotated_data.csv')


In [11]:
# keywords
keywords = get_sp_keywords(relevant_data)

In [12]:
SP_data = make_SP_train_dataset(relevant_df, keywords)
SP_data = SP_data[['id', 'processed_title', 'processed_body', 'processed_tags']]

non_SP_data = make_non_SP_train_dataset(all_cloud, relevant_data)

SP_data, non_SP_data = add_labels(SP_data, non_SP_data)

labeled_data = merge_dfs(SP_data, non_SP_data)

labeled_data = preprocess(labeled_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_title'] = df['processed_title'].fillna('').astype(str)


Pandas Apply:   0%|          | 0/186356 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/186356 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/186356 [00:00<?, ?it/s]

In [13]:
padded_sequences, tokenizer = make_sequences_and_labels(labeled_data)
training(padded_sequences, labeled_data, max_words, max_len, tokenizer)

Epoch 1/10




[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 78ms/step - accuracy: 0.9290 - loss: 0.1797 - val_accuracy: 0.9950 - val_loss: 0.0247
Epoch 2/10
[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 82ms/step - accuracy: 0.9958 - loss: 0.0210 - val_accuracy: 0.9965 - val_loss: 0.0164
Epoch 3/10
[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 86ms/step - accuracy: 0.9964 - loss: 0.0153 - val_accuracy: 0.9956 - val_loss: 0.0220
Epoch 4/10
[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 89ms/step - accuracy: 0.9975 - loss: 0.0111 - val_accuracy: 0.9964 - val_loss: 0.0221
Epoch 5/10
[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 90ms/step - accuracy: 0.9984 - loss: 0.0068 - val_accuracy: 0.9964 - val_loss: 0.0271
Epoch 6/10
[1m4193/4193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 92ms/step - accuracy: 0.9988 - loss: 0.0053 - val_accuracy: 0.9963 - val_loss: 0.0229
Epoch 7/1




