# BERT Feature Generation

In [None]:
from transformers import BertTokenizer, BertTokenizerFast, BertModel
import torch

# Select the compute device once: CUDA when torch reports it available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained BERT-large (uncased) encoder and its matching fast tokenizer.
# `model`, `tokenizer` and `device` are module-level globals read by generate_embeddings().
model_name = 'bert-large-uncased'
model = BertModel.from_pretrained(model_name).to(device)  # place the weights on the selected device (GPU or CPU)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

def generate_embeddings(text):
    """Return the BERT [CLS] embedding(s) of ``text`` as nested Python lists.

    ``text`` is handed to the module-level ``tokenizer`` (with padding and
    truncation enabled), the encoded batch is moved to ``device`` and run
    through ``model`` with gradient tracking disabled.

    Returns:
        A list with one entry per encoded sequence; each entry is the
        last-hidden-state vector at token position 0, as a list of floats.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    encoded = encoded.to(device)  # inputs must live on the same device as the model

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Position 0 of every sequence holds the [CLS] token representation;
        # bring it back to the CPU so it can be converted through NumPy.
        cls_vectors = hidden_states[:, 0, :].float().cpu().numpy()

    return cls_vectors.tolist()




Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Smoke test: embed one short sentence and display the resulting nested list.
generate_embeddings('hello this is new world')

[[-0.09641173481941223,
  0.1565498262643814,
  -0.3278251886367798,
  -0.23063358664512634,
  0.3414245843887329,
  -0.1972936987876892,
  -0.06797582656145096,
  0.9755256175994873,
  0.4901845157146454,
  -0.010199072770774364,
  0.3325594365596771,
  -0.1634402871131897,
  -0.8641223311424255,
  -0.4655573070049286,
  -0.29880207777023315,
  -0.9363102912902832,
  -0.5367949604988098,
  -0.8110844492912292,
  -0.2858543395996094,
  0.013287018984556198,
  -0.1978355497121811,
  0.3591686189174652,
  -0.5128210783004761,
  -0.6325428485870361,
  0.060042548924684525,
  0.037911612540483475,
  0.8878594040870667,
  -0.40826547145843506,
  -0.2494295984506607,
  0.24060297012329102,
  -0.8760027885437012,
  0.38980749249458313,
  0.6314957737922668,
  -0.16029588878154755,
  -0.3064649999141693,
  0.407703697681427,
  0.008190267719328403,
  -0.553011953830719,
  0.03767715021967888,
  0.3137062191963196,
  0.34455329179763794,
  0.4551444351673126,
  0.2258879393339157,
  -0.67617976

# Linguistic Feature Generation

In [None]:
import re
import os
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from collections import Counter
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fetch the NLTK resources used by the linguistic feature extractors.
nltk.download('punkt_tab')  # tokenizer data - presumably what word_tokenize / sent_tokenize load
nltk.download('brown')  # Brown corpus - NOTE(review): presumably required by TextBlob's noun-phrase extraction; confirm
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger model - presumably what pos_tag loads
nltk.download('stopwords')  # stop-word lists - NOTE(review): no visible cell reads nltk's stopwords; confirm it is still needed

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Average Word Length (AWL)
def average_word_length(text):
    """Return the mean character count of the tokens in ``text``.

    Tokens are whatever ``word_tokenize`` yields for the text. Returns 0 when
    the text produces no tokens.
    """
    tokens = word_tokenize(text)
    token_count = len(tokens)
    if token_count == 0:
        return 0

    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / token_count

# Pausality (PAU)
def pausality(text):
    """Return the ratio of pause punctuation to tokens in ``text``.

    Pause punctuation is every occurrence of ',', '.', ';' or ':' in the raw
    text; the denominator is the number of tokens from ``word_tokenize``.
    Returns 0 when there are no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0
    pause_total = sum(text.count(mark) for mark in (',', '.', ';', ':'))
    return pause_total / len(tokens)

# Average Noun-Phrase Length (ANP)
def average_noun_phrase_length(text):
    """Return the mean number of words per noun phrase found by TextBlob.

    Each phrase in ``TextBlob(text).noun_phrases`` is split on whitespace to
    count its words. Returns 0 when no noun phrases are found.
    """
    phrases = TextBlob(text).noun_phrases
    word_total = 0
    for phrase in phrases:
        word_total += len(phrase.split())
    if not phrases:
        return 0
    return word_total / len(phrases)

# Average Sentence Length (ASL)
def average_sentence_length(text):
    """Return the mean number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``,
    both applied to the whole text. Returns 0 when no sentence is found.
    """
    sentence_count = len(sent_tokenize(text))
    token_count = len(word_tokenize(text))
    if sentence_count == 0:
        return 0
    return token_count / sentence_count

# Number of Clauses (NCL)
def number_of_clauses(text):
    """Return how many segments ``text`` has when cut at every ';' or ':'.

    A text with no separator counts as a single segment, so the result is
    always at least 1 (including for the empty string).
    """
    separators = re.findall(r'[;:]', text)
    # n separators always delimit n + 1 segments.
    return len(separators) + 1

# Number of Words (NWO)
def number_of_words(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)

# Number of Verbs (NVB)
def number_of_verbs(text):
    """Return how many tokens of ``text`` receive a POS tag starting with 'VB'."""
    tagged_tokens = pos_tag(word_tokenize(text))
    verb_total = 0
    for _token, tag in tagged_tokens:
        if tag.startswith('VB'):
            verb_total += 1
    return verb_total

# Number of Adjectives (NAJ)
def number_of_adjectives(text):
    """Return how many tokens of ``text`` receive a POS tag starting with 'JJ'."""
    tagged_tokens = pos_tag(word_tokenize(text))
    adjective_tags = [tag for _token, tag in tagged_tokens if tag.startswith('JJ')]
    return len(adjective_tags)

# Number of Passive Voice constructions (NPV)
def number_of_passive_voice(text):
    """Count adjacent tag pairs where a 'VBN' token is followed by a 'BY' or 'IN' token.

    NOTE(review): 'BY' does not look like a tag the default NLTK tagger emits
    (the word "by" would normally be tagged 'IN'), so in practice this appears
    to match a past participle followed by any preposition - confirm that this
    is the intended passive-voice heuristic.
    """
    tags = [tag for _token, tag in pos_tag(word_tokenize(text))]
    follower_tags = ('BY', 'IN')
    # Pair every tag with its right-hand neighbour.
    return sum(
        1
        for current, following in zip(tags, tags[1:])
        if current == 'VBN' and following in follower_tags
    )

# Emotiveness (EMO)
def emotiveness(text):
    """Return the sentiment polarity TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Function to calculate Content Diversity (CDV)
def content_diversity(text):
    """Estimate topical diversity of ``text`` as the share of unique LDA top words.

    The text is split into sentences, vectorised as a bag of words with
    English stop words removed, and a two-topic LDA model is fitted on the
    sentence/term matrix. The five highest-weighted words of each topic are
    pooled and the result is ``unique_words / pooled_words``.

    Returns:
        The diversity ratio, or 0 when the text yields no usable vocabulary
        (no sentences, or only stop words / tokens the vectoriser ignores).
        Returning 0 here matches the other extractors' empty-input behaviour
        and keeps one degenerate review from failing the whole feature row.

    Raises:
        ValueError: any vectoriser error other than an empty vocabulary.
    """
    num_topics = 2
    num_top_words = 5

    sentences = sent_tokenize(text)
    if not sentences:
        return 0

    # Bag-of-words per sentence with built-in English stop-word removal.
    vectorizer = CountVectorizer(stop_words='english')
    try:
        term_matrix = vectorizer.fit_transform(sentences)
    except ValueError as err:
        # CountVectorizer signals "empty vocabulary; ..." when nothing survives
        # tokenisation/stop-word filtering; anything else is a real error.
        if "empty vocabulary" not in str(err):
            raise
        return 0

    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(term_matrix)

    feature_names = vectorizer.get_feature_names_out()

    top_words = []
    for topic in lda_model.components_:
        # argsort is ascending, so walk it backwards to take the
        # num_top_words highest-weighted terms of this topic.
        top_indices = topic.argsort()[:-num_top_words - 1:-1]
        top_words.extend(feature_names[i] for i in top_indices)

    if not top_words:
        return 0
    return len(set(top_words)) / len(top_words)


# Function to calculate Redundancy (RED)
def redundancy(text):
    """Return the fraction of distinct vocabulary terms occurring more than once.

    ``text`` is vectorised as a single document with ``CountVectorizer``; the
    result is ``terms_with_count_gt_1 / distinct_terms``.

    Returns:
        The ratio, or 0 when the text yields no vocabulary at all. The
        vectoriser raises on an empty vocabulary before any size check can
        run, so that case is handled explicitly here.

    Raises:
        ValueError: any vectoriser error other than an empty vocabulary.
    """
    vectorizer = CountVectorizer()
    try:
        word_counts = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError as err:
        if "empty vocabulary" not in str(err):
            raise
        return 0

    if not word_counts.size:
        return 0
    repeated_terms = sum(1 for count in word_counts if count > 1)
    return repeated_terms / len(word_counts)

# Lexical Diversity (LXD)
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens come from ``word_tokenize`` and are compared case-sensitively.
    Returns 0 when the text produces no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0
    distinct_tokens = set(tokens)
    return len(distinct_tokens) / len(tokens)

# Function to count Number of Modal Verbs (NMV)
def number_of_modal_verbs(text):
    """Return how many tokens of ``text`` are modal verbs (case-insensitive).

    A token counts when its lower-cased form is one of: can, could, may,
    might, must, shall, should, will, would. Matching is purely lexical, so
    no part-of-speech tagging is needed.
    """
    modal_verbs = frozenset(
        ('can', 'could', 'may', 'might', 'must', 'shall', 'should', 'will', 'would')
    )
    return sum(1 for word in word_tokenize(text) if word.lower() in modal_verbs)

# Function to count Number of Typos (NTY)
def number_of_typos(text):
    """Count tokens from ``word_tokenize`` that are absent from ``TextBlob(text).words``.

    NOTE(review): despite the name, no spell check is performed here; the
    count reflects tokens the two tokenisers disagree on (presumably mostly
    punctuation tokens). Confirm whether a real spelling check was intended.
    """
    tokens = word_tokenize(text)
    # Build the lookup once as plain strings so each membership test is O(1)
    # instead of rescanning the word list for every token.
    known_words = {str(word) for word in TextBlob(text).words}
    return sum(1 for token in tokens if token not in known_words)

def get_lingustics_features(text):
    """Run every linguistic extractor on ``text`` and return the values as a list.

    The list has 15 entries, ordered exactly as the extractors are listed
    below (pausality first, number_of_typos last).
    """
    # Feature name -> extractor; insertion order defines the output order.
    extractors = {
        "pausality": pausality,
        "average_word_length": average_word_length,
        "average_noun_phrase_length": average_noun_phrase_length,
        "average_sentence_length": average_sentence_length,
        "number_of_clauses": number_of_clauses,
        "number_of_words": number_of_words,
        "number_of_verbs": number_of_verbs,
        "number_of_adjectives": number_of_adjectives,
        "number_of_passive_voice": number_of_passive_voice,
        "emotiveness": emotiveness,
        "content_diversity": content_diversity,
        "redundancy": redundancy,
        "lexical_diversity": lexical_diversity,
        "number_of_modal_verbs": number_of_modal_verbs,
        "number_of_typos": number_of_typos,
    }
    return [extract(text) for extract in extractors.values()]

In [None]:
# Sample sentence used to try out the linguistic feature extractors.
text = "The quick brown fox jumps over the lazy dog."

In [None]:
# Display the feature list for the sample sentence (values appear in extractor order).
get_lingustics_features(text)

[0.1,
 3.6,
 3.0,
 10.0,
 1,
 10,
 1,
 2,
 0,
 0.04166666666666666,
 0.6,
 0.125,
 1.0,
 0,
 1]

# Data Set Extraction

In [None]:
import csv

In [None]:
def read_merge_file(file_path):
    """Load the merged review CSV into a list of record dicts.

    The first row is treated as a header and skipped. For every following
    row, column 0 becomes ``"review_text"`` and column 1 becomes ``"label"``;
    both are kept as the strings read from the file. Completely blank lines
    are ignored.

    Args:
        file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of ``{"review_text": str, "label": str}`` dicts in file order.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank data row has fewer than two columns.
    """
    out_data = []

    # newline='' leaves line-ending handling to the csv module, which is
    # required for quoted fields that contain embedded newlines.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; tolerate an empty file

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            out_data.append({
                "review_text": row[0],
                "label": row[1]
            })

    return out_data

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset paths under
# /content/drive used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the merged review CSV from Drive and show the first record as a sanity check.
merge_file_path = '/content/drive/My Drive/dataset/merge_data.csv'
merge_data = read_merge_file(merge_file_path)
print(merge_data[0])  # NOTE(review): raises IndexError if the file has no data rows

{'review_text': 'Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty', 'label': '0'}


# Feature Extraction

In [None]:
import csv
import random
import numpy as np
import pandas as pd

In [None]:
file_path = '/content/drive/My Drive/dataset/merge_data.csv'
data = read_merge_file(file_path)
# data = random.sample(data, 1000)  # uncomment to iterate on a small random sample
bert_features = []
linguistic_features = []
dataset_features = []
skipped_rows = []  # indices of rows whose feature extraction failed

# Walk the parsed records directly; each record is a dict with
# 'review_text' and 'label' keys as produced by read_merge_file().
for index, row in enumerate(data):
    try:
        text = row['review_text']
        word_embeddings = generate_embeddings(text)
        lingustic_result = get_lingustics_features(text)
    except Exception as e:
        # Best effort: one bad review must not abort the whole run, but
        # record which row failed and why so the loss can be audited.
        skipped_rows.append(index)
        print(f"Skipping row {index}: {type(e).__name__}: {e}")
        continue

    bert_features.append(word_embeddings)
    linguistic_features.append(lingustic_result)
    dataset_features.append({
        "bert_features": word_embeddings,
        "linguistic_features": lingustic_result,
        "label": row['label']
    })

print(f"Extracted features for {len(dataset_features)} of {len(data)} rows "
      f"({len(skipped_rows)} skipped)")

# Convert to DataFrame
df = pd.DataFrame(dataset_features)

# Write to JSON file
dataset_file_path = '/content/drive/My Drive/dataset/dataset.json'
df.to_json(dataset_file_path, orient="records", indent=2)


empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words


In [None]:
# Reload the exported feature file and report how many records it holds.
dataset = pd.read_json(dataset_file_path, lines=False)
total_rows = dataset.shape[0]  # Returns the number of rows
print(total_rows)

42030


# Model Training

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np

In [None]:
dataset_file_path = '/content/drive/My Drive/dataset/dataset.json'
dataset = pd.read_json(dataset_file_path, lines=False)

# Build the feature matrix x and label vector y from the stored records.
x = []
y = []

for index, item in dataset.iterrows():
    try:
        # bert_features is stored as a one-element nested list; take the
        # single embedding and append the linguistic features to it.
        bert = item["bert_features"][0]
        ling = item["linguistic_features"]
        features = bert + ling
        label = int(item["label"])
    except Exception as e:
        print(index)
        print(item, index)
        print(e)
        continue

    # Append both only after the whole row parsed successfully, so a bad
    # label can never leave x and y with different lengths.
    x.append(features)
    y.append(label)


x = np.array(x)
y = np.array(y)


In [None]:
# Step 1: Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train_unscaled = x_train.copy()  # keep the raw training features before scaling overwrites x_train
x_test_unscaled = x_test.copy()  # keep the raw test features before scaling overwrites x_test

# Step 2: Scale the features with MinMaxScaler (not StandardScaler).
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)  # fit the scaling on the training split only, then transform it
x_test = scaler.transform(x_test)  # reuse the training-split scaling; never refit on test data

# Step 3: Train a linear-kernel SVM on the scaled training features.
svm = SVC(
    kernel="linear",
    tol=1e-3,  # stopping tolerance; 1e-3 is the default, so this does not loosen it
    max_iter=-1,  # -1 means no limit (let it run until convergence)
    random_state=42,
    cache_size=2000,  # larger kernel cache to speed up training
    C=1.0  # Default regularization strength (adjust if needed)
)
svm.fit(x_train, y_train)

# Step 4: Predict on the scaled held-out split; metrics are computed in a later cell.
y_pred = svm.predict(x_test)

In [None]:
import json
# Weight vector of the fitted linear SVM, converted to a plain list for JSON.
w = svm.coef_[0].tolist()

# Get the bias (intercept) b
b = svm.intercept_[0]

print("Weight vector w:", svm.coef_[0])
print("Weight vector w len:", len(w))
print("Bias b:", b)
coff={
    'w':w,
    'b':b
}
coff_export_path = '/content/drive/My Drive/dataset/coff.json'

# NOTE(review): svm was fitted on MinMaxScaler-transformed features, so w and b
# apply to scaled inputs; the scaler's parameters are not written here - confirm
# that whoever consumes coff.json can reproduce the same scaling.
with open(coff_export_path, 'w') as file:
    json.dump(coff, file)

Weight vector w: [-0.25637944 -0.44239671  0.22775711 ...  1.81478338 -0.50005211
  3.55150252]
Weight vector w len: 1039
Bias b: -5.393439475786247


In [None]:


# Score the held-out predictions: accuracy plus support-weighted precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'macro' or 'micro' if needed
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)  # NOTE(review): computed but not displayed in this cell

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")




Accuracy: 0.9111
Precision: 0.9112
Recall: 0.9111
F1 Score: 0.9111
