In [None]:
import pandas as pd

# Read the labelled tweet dataset from a CSV file into a DataFrame.
dataset_file = "sardet.csv"
df = pd.read_csv(filepath_or_buffer=dataset_file)

# Preview the first five rows (the default for head) as a quick sanity check.
print(df.head(n=5))

# Data Preprocessing
def preprocess_text(text):
    """Normalize a single tweet before it is tokenized.

    Only lowercasing is applied for now; further cleaning steps (removing
    special characters, URLs, ...) can be added here.

    Args:
        text: The raw tweet. Normally a ``str``. Missing values (``None`` or a
            float NaN, which is how pandas represents an empty CSV cell) and
            other non-string values are tolerated instead of raising.

    Returns:
        str: The lowercased text, or ``""`` when the value is missing.
    """
    if isinstance(text, str):
        return text.lower()
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Any other value (e.g. a cell parsed as a number) is stringified first.
    return str(text).lower()

# Run every tweet through preprocess_text and store the result back in the column.
df['tweets'] = df['tweets'].map(preprocess_text)

# Show the first five rows again so the effect of preprocessing is visible.
print(df.head(n=5))


                                              tweets  label
0                     I love working midnights tweet      1
1  I hate when I buy a bag of air and there's chi...      1
2  my grandad always sounds so ill when i speak t...      0
3  I realize I'm annoying to everyone, so I won't...      0
4  I love when I find these dudes on vine!! #Foll...      1
                                              tweets  label
0                     i love working midnights tweet      1
1  i hate when i buy a bag of air and there's chi...      1
2  my grandad always sounds so ill when i speak t...      0
3  i realize i'm annoying to everyone, so i won't...      0
4  i love when i find these dudes on vine!! #foll...      1


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Name of the pre-trained checkpoint shared by the tokenizer and the model.
BERT_CHECKPOINT = 'bert-base-uncased'

# Instantiate the BERT tokenizer and model from that checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenize and obtain embeddings for each tweet
def tokenize_and_embed(text):
    """Embed one text with BERT and return the result as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` with gradient tracking disabled. The last hidden
    state is then averaged over dim 1 (mean pooling over the tokens); note that
    this is an average, not the vector of the [CLS] token alone.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray: The mean-pooled last hidden state with size-1
        dimensions squeezed out.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        bert_output = model(**encoded)

    # Average across dim 1, then drop singleton dimensions before converting.
    pooled = bert_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every tweet; each cell of the new 'embeddings' column holds one vector.
df['embeddings'] = df['tweets'].apply(tokenize_and_embed)

# Stack the per-tweet vectors row-wise into a single numpy array
# that can be fed to machine learning models.
embeddings = np.vstack(df['embeddings'].tolist())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Step 4: Feature Engineering - No additional feature engineering is required for this example.

# Step 5: Model Selection
# Split the data into training and testing sets
X = embeddings  # Features
y = df['label']  # Labels

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a machine learning model (Logistic Regression in this example).
# With the default iteration budget the solver stops before converging on these
# features ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"), so max_iter is raised
# explicitly. If the warning still appears, raise it further or scale the features.
# NOTE: this rebinds the name `model`, which previously referred to the BERT
# model; tokenize_and_embed must not be called again until `model` is reloaded.
model = LogisticRegression(random_state=42, max_iter=1000)

# Step 6: Training and Validation
model.fit(X_train, y_train)

# Step 7: Hyperparameter Tuning - You can perform hyperparameter tuning here if necessary.

# Step 8: Evaluation and Testing
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

# Print classification report for more detailed evaluation
print(classification_report(y_test, y_pred))


Accuracy: 0.90
F1 Score: 0.90
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       203
           1       0.90      0.90      0.90       196

    accuracy                           0.90       399
   macro avg       0.90      0.90      0.90       399
weighted avg       0.90      0.90      0.90       399



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import joblib

# The fitted classifier is expected to be bound to `model` at this point.
trained_model = model

# Destination file for the serialized model.
joblib_filename = 'sarcasm_detection_model.joblib'

# Serialize the trained model to disk with joblib.
joblib.dump(trained_model, filename=joblib_filename)

print(f"Model saved to {joblib_filename}")


Model saved to sarcasm_detection_model.joblib


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import joblib
import numpy as np

# Restore the classifier saved earlier. joblib.load unpickles the file, so it
# should only be pointed at files from a trusted source.
loaded_model = joblib.load('sarcasm_detection_model.joblib')

# Recreate the BERT tokenizer and model from the same pre-trained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def predict_sarcasm(input_text, model, tokenizer):
    """Predict a label for one text using BERT embeddings and the saved classifier.

    The text is tokenized, passed through ``model`` with gradient tracking
    disabled, and the last hidden state is averaged over dim 1 (mean pooling,
    not the [CLS] vector alone). The resulting vector is handed to the
    module-level ``loaded_model`` for classification.

    Args:
        input_text: The text to classify.
        model: The BERT model that produces the hidden states.
        tokenizer: The tokenizer used to encode ``input_text`` for ``model``.

    Returns:
        The first (and only) prediction returned by ``loaded_model.predict``.
    """
    encoded = tokenizer(input_text, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        bert_output = model(**encoded)

    # Mean-pool over dim 1, then drop singleton dimensions before converting.
    pooled = bert_output.last_hidden_state.mean(dim=1)
    features = pooled.squeeze().numpy()

    # The classifier expects a batch, so wrap the single vector in a list.
    predictions = loaded_model.predict([features])
    return predictions[0]

# Ask the user for a text and classify it.
input_text = input("Enter a text: ")
prediction = predict_sarcasm(input_text, model, tokenizer)

# A prediction of 1 is reported as sarcastic; anything else as not sarcastic.
print("The text is sarcastic." if prediction == 1 else "The text is not sarcastic.")


Enter a text: Oh, the Ultra-Healthy Chocolate-Covered Broccoli – a masterpiece of culinary contradiction! Who needs regular chocolate when you can have this guilt-free delicacy? I mean, who even likes indulgence when you can have the pleasure of eating something green and healthy disguised under a layer of chocolate?  The moment I took a bite of this prodigious creation, my taste buds did a celebratory conga dance. The satisfying crunch of broccoli mingled with the rich, velvety chocolate – what an unmatched symphony of flavors. Forget about those boring old chocolates that don't contribute to your daily dose of vitamins and minerals. This is the epitome of "having it all."
The text is sarcastic.
