Data processing and Random Forest Classification Train/Test


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the labelled ad dataset from Google Drive (this cell only runs on Colab).
from google.colab import drive
# Mount the user's Drive under /content/drive so the CSV below is readable.
drive.mount('/content/drive')
# NOTE(review): absolute, user-specific Drive path; other users must edit it.
filepath = "/content/drive/MyDrive/COMP 488 Zillow/SyntheticExpert/FinalFolder/Labeled_Training_Sample.csv"
data = pd.read_csv(filepath)
# Quick sanity check: column names, non-null counts and dtypes.
data.info()

def standardize_labels(label):
    """Normalise a raw classification label so equivalent variants compare equal.

    The label is lower-cased, surrounding whitespace is removed and trailing
    periods are dropped, so ``"Sales Promotion."`` and ``" sales promotion "``
    both become ``"sales promotion"``.

    Args:
        label: Raw label string.

    Returns:
        The cleaned label string.
    """
    # Strip whitespace first: otherwise a label such as "Sales promotion. "
    # keeps its period, because the string does not end with '.'.
    label = label.strip().lower()
    # Drop trailing periods, then any whitespace that preceded them.
    return label.rstrip('.').rstrip()

# Normalise every label, then fold the two stray variants into their
# canonical classes in a single replace step.
data['Classification'] = (
    data['Classification']
    .apply(standardize_labels)
    .replace({
        'event promotion': 'brand awareness',
        'this ad can be classified as a sales promotion': 'sales promotion',
    })
)

# Show which classes remain and how many ads fall into each.
print(data['Classification'].unique())
print(data['Classification'].value_counts())


# Prepare the data
X = data['Ad Description1']  # Feature: raw ad text
y = data['Classification']   # Labels: cleaned ad category

# Hold out 30% of the ads for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize the text data. TF-IDF is fitted on the training texts only and
# then applied to the test texts, so the test set does not shape the vocabulary.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train the model. random_state pins the forest's internal randomness so the
# report below is reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out ads
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7597 entries, 0 to 7596
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Product/Company  7597 non-null   object
 1   Ad Description1  7597 non-null   object
 2   Classification   7597 non-null   object
dtypes: object(3)
memory usage: 178.2+ KB
['product promotion' 'service promotion' 'sales promotion'
 'lead generation' 'brand awareness']
Classification
product promotion    3207
sales promotion      3000
service promotion     705
brand awareness       537
lead generation       148
Name: count, dtype: int64
                   precision    recall  f1-score   support

  brand awareness       0.72      0.68      0.70       152
  lead generation       0.63      0.66      0.65        47
product promotion       0.85      0.89      0.87       983
  sales promotion       0.91      0.88      0.89       874
service promotion       0.78      0.7

In [None]:
# List the column names of the loaded frame.
print(data.columns)

Index(['Product/Company', 'Ad Description1', 'Classification'], dtype='object')


SVM Classification Test

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Features are the raw ad texts; targets are the cleaned category labels.
X, y = data['Ad Description1'], data['Classification']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Chain vectorisation, scaling and classification into a single estimator.
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    # with_mean=False keeps the scaler compatible with the sparse TF-IDF matrix.
    StandardScaler(with_mean=False),
    # Linear kernel, which is often effective for text.
    SVC(kernel='linear', C=1),
)

# Fit on the training texts (fit returns the pipeline itself), then score the
# held-out texts.
predictions = pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))


                   precision    recall  f1-score   support

  brand awareness       0.63      0.68      0.65       152
  lead generation       0.63      0.72      0.67        47
product promotion       0.85      0.83      0.84       983
  sales promotion       0.87      0.86      0.87       874
service promotion       0.73      0.77      0.75       224

         accuracy                           0.82      2280
        macro avg       0.74      0.77      0.76      2280
     weighted avg       0.83      0.82      0.82      2280



In [None]:
#SVM and Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the ads. The raw training text is kept under its own name so
# that cross-validation below can refit TF-IDF inside every fold.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TF-IDF fitted on the training texts only, then applied to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# RandomForest Classifier (seeded so the report and importances are reproducible)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classifier Report")
print(classification_report(y_test, rf_predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

# Feature importance: one value per TF-IDF vocabulary term, largest first.
feature_importances = pd.DataFrame(
    {'importance': rf_model.feature_importances_},
    index=vectorizer.get_feature_names_out(),
).sort_values('importance', ascending=False)
print(feature_importances.head(10))

# Support Vector Machine Classifier with a pipeline (operates on the TF-IDF matrix)
pipeline = make_pipeline(
    StandardScaler(with_mean=False),  # with_mean=False is required for sparse input
    SVC(kernel='linear', C=1)
)

pipeline.fit(X_train, y_train)
svc_predictions = pipeline.predict(X_test)
print("SVM Classifier Report")
print(classification_report(y_test, svc_predictions))

# Cross-validation to assess model reliability.
# The vectorizer is part of each cross-validated pipeline, so every fold fits
# its vocabulary/IDF on that fold's training portion only. Cross-validating on
# the pre-vectorized X_train would let each validation fold influence the
# features it is later scored on.
rf_cv_pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=42),
)
rf_cross_val_scores = cross_val_score(rf_cv_pipeline, X_train_text, y_train, cv=5)
print("Random Forest Cross-Validation Scores:", rf_cross_val_scores)

svc_cv_pipeline = make_pipeline(
    TfidfVectorizer(),
    StandardScaler(with_mean=False),
    SVC(kernel='linear', C=1),
)
svc_cross_val_scores = cross_val_score(svc_cv_pipeline, X_train_text, y_train, cv=5)
print("SVM Cross-Validation Scores:", svc_cross_val_scores)


Random Forest Classifier Report
                   precision    recall  f1-score   support

  brand awareness       0.72      0.67      0.70       152
  lead generation       0.65      0.66      0.65        47
product promotion       0.86      0.89      0.87       983
  sales promotion       0.90      0.89      0.90       874
service promotion       0.78      0.76      0.77       224

         accuracy                           0.86      2280
        macro avg       0.78      0.77      0.78      2280
     weighted avg       0.86      0.86      0.86      2280

Confusion Matrix:
[[102   0  28   5  17]
 [  1  31   6   5   4]
 [ 21   4 872  66  20]
 [  6   8  78 776   6]
 [ 11   5  31   7 170]]
        importance
off       0.049606
save      0.023278
free      0.019034
50        0.018455
low       0.018380
today     0.018373
shop      0.017022
on        0.016462
35        0.015137
online    0.013957
SVM Classifier Report
                   precision    recall  f1-score   support

  brand a

In [None]:
import joblib
# Save the model
# (rf_model is the Random Forest fitted in the comparison cell; the file is
# written relative to the current working directory.)
joblib.dump(rf_model, 'random_forest_model.joblib')

# Save the vectorizer
# The fitted TF-IDF vectorizer must be stored alongside the model: new text
# has to be transformed with the same vocabulary before prediction.
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']

In [None]:
def load_resources(model_path='random_forest_model.joblib',
                   vectorizer_path='tfidf_vectorizer.joblib'):
    """Load a persisted classifier and its matching vectorizer from disk.

    Args:
        model_path: Path of the joblib file holding the fitted model.
            Defaults to the file name used when the model was saved.
        vectorizer_path: Path of the joblib file holding the fitted
            vectorizer. Defaults to the file name used when it was saved.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    # SECURITY: joblib files are pickles, and unpickling can execute arbitrary
    # code. Only load files that this notebook (or a trusted party) produced.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    return model, vectorizer

def predict_ad_type(ad_text, model, vectorizer):
    """Classify a single ad description.

    Args:
        ad_text: The ad text to classify.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        The first (and only) label predicted for ``ad_text``.
    """
    # The vectorizer works on a collection of documents, so wrap the single
    # text in a one-element list.
    features = vectorizer.transform([ad_text])
    # predict yields one label per input row; unwrap the single result.
    return model.predict(features)[0]

# Load the persisted model and vectorizer once so repeated predictions reuse them.
# Note: this rebinds the notebook-level names `model` and `vectorizer`.
model, vectorizer = load_resources()

# Example usage: read one ad description interactively and print its predicted category.
ad_text = input("Enter the ad text: ")
predicted_type = predict_ad_type(ad_text, model, vectorizer)
print(f"The ad type is predicted to be: {predicted_type}")


Enter the ad text: 35% Off Today and Fast, Free Shipping. Pet Food & Supplies from 3,000+ Brands. Shop Today! Low Prices.
The ad type is predicted to be: sales promotion


In [None]:
from transformers import AutoTokenizer

# Specify the model identifier
model_name = "roberta-base"  # Example model, adjust based on your choice

# Load the tokenizer associated with the model
# NOTE(review): a later cell rebinds `tokenizer` to a BertTokenizer, and no
# code visible in this notebook uses this RoBERTa tokenizer — confirm whether
# this cell is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Naive Bayes

In [None]:
# Naive Bayes
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Download necessary NLTK resources.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# for word_tokenize / pos_tag, while older releases use the legacy names, so
# both generations are requested. nltk.download reports a package it cannot
# fetch and returns False rather than raising, so an unknown name is harmless.
for _resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# Function to preprocess text data with Part-of-Speech tagging
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the lemmatizer and the English stop-word set once and reuse them."""
    return WordNetLemmatizer(), frozenset(stopwords.words('english'))


def preprocess(text):
    """Reduce an ad text to its lemmatized nouns and adjectives.

    The text is lower-cased and tokenized, non-alphabetic tokens are dropped,
    the remaining tokens are lemmatized and POS-tagged, and only tokens tagged
    as nouns (``NN*``) or adjectives (``JJ*``) that are not English stop words
    are kept.

    Args:
        text: Raw ad description.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    # The lemmatizer and stop-word list do not depend on `text`; building them
    # on every call repeated the same work for every row of the dataset.
    lemmatizer, stop_words = _preprocess_resources()

    tokens = word_tokenize(text.lower())
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]

    # Part-of-Speech tagging and keeping adjectives and nouns
    tagged_tokens = pos_tag(lemmatized_tokens)
    return ' '.join(
        word for word, tag in tagged_tokens
        if tag.startswith(('NN', 'JJ')) and word not in stop_words
    )

# Work on a copy: `nb_data = data` would only alias the shared frame, so adding
# the 'text'/'label' columns here would silently modify `data` for every other cell.
nb_data = data.copy()

# Preprocess the text data (lemmatized nouns/adjectives only) and carry the labels over
nb_data['text'] = nb_data['Ad Description1'].apply(preprocess)
nb_data['label'] = nb_data['Classification']

# Splitting the data (same 70/30 split and seed as the other models)
X_train, X_test, y_train, y_test = train_test_split(nb_data['text'], nb_data['label'], test_size=0.3, random_state=42)

# Set up the pipeline with TfidfVectorizer and MultinomialNB.
# ngram_range, max_df and min_df given here are only starting values: the grid
# search below overrides all three.
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', analyzer='word', ngram_range=(1, 3), max_df=0.75, min_df=1),
    MultinomialNB()
)

# Define the parameter grid; keys use the step names that make_pipeline
# derives from the lower-cased class names.
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidfvectorizer__max_df': [0.5, 0.75, 1.0],
    'tfidfvectorizer__min_df': [1, 2, 3],
    'multinomialnb__alpha': [0.01, 0.1, 1, 10]
}

# Set up GridSearchCV for hyperparameter tuning (5-fold CV on the training split only)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best model (refitted on the whole training split by GridSearchCV)
nb_model = grid_search.best_estimator_

# Predict and evaluate on the held-out ads
predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, predictions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Accuracy: 0.8350877192982457
                   precision    recall  f1-score   support

  brand awareness       0.77      0.64      0.70       152
  lead generation       0.51      0.81      0.62        47
product promotion       0.87      0.84      0.85       983
  sales promotion       0.88      0.87      0.88       874
service promotion       0.68      0.79      0.73       224

         accuracy                           0.84      2280
        macro avg       0.74      0.79      0.76      2280
     weighted avg       0.84      0.84      0.84      2280



BERT

In [None]:
!pip install transformers[torch]
!pip install datasets




In [None]:
#Bert Model
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load your data here
# Assumes the labelled DataFrame `data` loaded earlier is still in memory and
# has the columns 'Ad Description1' (text) and 'Classification' (label).

# Take an explicit copy of the two needed columns. Assigning into the bare
# slice data[[...]] is what raised pandas' SettingWithCopyWarning.
bert_data = data[['Ad Description1', 'Classification']].copy()
bert_data.columns = ['text', 'label']

# Convert labels to integers (ids follow the order of first appearance)
label_map = {label: i for i, label in enumerate(bert_data['label'].unique())}
bert_data['label'] = bert_data['label'].map(label_map)

# Split the data into train and test sets
train_data, test_data = train_test_split(bert_data, test_size=0.3, random_state=42)

# Convert pandas DataFrame to datasets.Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating to the model's maximum length."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Model
# id2label/label2id are stored in the model config, so a checkpoint written by
# save_pretrained carries the class names with it instead of depending on the
# in-memory `label_map`.
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map),
    id2label={i: label for label, i in label_map.items()},
    label2id=label_map,
)

# Training
# NOTE(review): the test split doubles as the per-epoch evaluation set and
# drives load_best_model_at_end, so the final accuracy is optimistic; a
# separate validation split would give an unbiased estimate.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    "test_trainer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,  # Adjust learning rate
    per_device_train_batch_size=8,  # Adjust batch size
    num_train_epochs=3,  # Adjust number of epochs
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
)
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Accuracy over the arg-max class of the logits.
    compute_metrics=lambda p: {"accuracy": accuracy_score(y_true=p.label_ids, y_pred=p.predictions.argmax(axis=1))},
)

trainer.train()

# Evaluation
eval_results = trainer.evaluate()

# Calculate accuracy score
bert_accuracy = eval_results["eval_accuracy"]
print("Accuracy Score:", bert_accuracy)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_data['label'] = bert_data['label'].map(label_map)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/5317 [00:00<?, ? examples/s]

Map:   0%|          | 0/2280 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6421,0.488076,0.846491
2,0.4202,0.446027,0.854386
3,0.3795,0.449383,0.860088


Accuracy Score: 0.8600877192982456


Synthetic Expert First Draft


In [None]:

# Persist the fine-tuned model and its tokenizer to two local directories
# (relative to the current working directory) so they can be reloaded below.
bert_model.save_pretrained("bert_model")
tokenizer.save_pretrained( "bert_tokenizer")

# Example of loading resources and making predictions
def load_resources(model_path, tokenizer_path):
    """Load a saved BERT sequence classifier and its tokenizer.

    Note: this definition replaces the earlier joblib-based ``load_resources``
    defined in this notebook; after this cell runs, the name refers to the
    BERT loader only.

    Args:
        model_path: Directory (or identifier) passed to
            ``BertForSequenceClassification.from_pretrained``.
        tokenizer_path: Directory (or identifier) passed to
            ``BertTokenizer.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        BertForSequenceClassification.from_pretrained(model_path),
        BertTokenizer.from_pretrained(tokenizer_path),
    )

def predict_ad_type(ad_text, model, tokenizer):
    """Classify a single ad description with a fine-tuned BERT classifier.

    Args:
        ad_text: The ad text to classify.
        model: Sequence-classification model; called with the tokenizer output
            as keyword arguments and expected to return an object with a
            ``logits`` tensor.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The label whose id in the notebook-level ``label_map`` equals the
        arg-max class of the logits.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    # Tokenize the input text
    inputs = tokenizer(ad_text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # One input text -> one row of logits; take the best class of that row.
    predicted_label_idx = outputs.logits[0].argmax().item()

    # Map label index back to label
    label_map_inverse = {v: k for k, v in label_map.items()}
    return label_map_inverse[predicted_label_idx]

# Load the model and tokenizer back from the two local directories written above,
# once, so repeated predictions reuse them.
loaded_model, loaded_tokenizer = load_resources("bert_model", "bert_tokenizer")

# Example usage: read one ad description interactively and print its predicted category.
ad_text = input("Enter the ad text: ")
predicted_type = predict_ad_type(ad_text, loaded_model, loaded_tokenizer)
print(f"The ad type is predicted to be: {predicted_type}")

Enter the ad text: 35% Off Today and Fast, Free Shipping. Pet Food & Supplies from 3,000+ Brands. Shop Today! Low Prices.
The ad type is predicted to be: sales promotion


Save Model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Let's assume your model is bert_model and tokenizer is tokenizer
# Model and tokenizer are written into the same Drive folder.
# NOTE(review): this path uses "My Drive" while the dataset path earlier in the
# notebook uses "MyDrive" — confirm both resolve to the same Drive root.
model_path = "/content/drive/My Drive/BertModel"  # Path to save on Google Drive
tokenizer_path = "/content/drive/My Drive/BertModel"  # Path to save on Google Drive

# Save the model
bert_model.save_pretrained(model_path)
# Save the tokenizer
# (last expression of the cell, so the tuple of written file paths is displayed)
tokenizer.save_pretrained(tokenizer_path)

('/content/drive/My Drive/BertModel/tokenizer_config.json',
 '/content/drive/My Drive/BertModel/special_tokens_map.json',
 '/content/drive/My Drive/BertModel/vocab.txt',
 '/content/drive/My Drive/BertModel/added_tokens.json')