In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data  = pd.read_excel('jsondataset.xlsx')

In [3]:
data.columns

Index(['template_name', 'category', 'json'], dtype='object')

# **train and test**

In [4]:
# Create DataFrame
# df = pd.DataFrame(data)
df = data

# Combine features and target variable
X = df['template_name'] + ' ' + df['category']  # Combine features
y = df['json']

# Check lengths
print(f"Length of X: {len(X)}, Length of y: {len(y)}")  # Should be the same

# Vectorize the text data
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Check shape of the vectorized matrix
print(f"Shape of X_vectorized: {X_vectorized.shape}")  # Should match number of samples

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Verify sizes after split
print(f"X_train size: {X_train.shape}, y_train size: {len(y_train)}")
print(f"X_test size: {X_test.shape}, y_test size: {len(y_test)}")


Length of X: 120, Length of y: 120
Shape of X_vectorized: (120, 182)
X_train size: (96, 182), y_train size: 96
X_test size: (24, 182), y_test size: 24


In [None]:
# X_train
np.save('X_train.npy', X_train)


## **logistics regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


## **random classfier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample data (make sure to replace this with your actual dataset)
# X = ["login_alert", "new_features_update", ...]  # your template names or texts
# y = ["json1", "json2", ...]  # your JSON outputs

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate model performance
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


## **SVM - Support Vector Machine**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample data (make sure to replace this with your actual dataset)
# X = ["login_alert", "new_features_update", ...]  # your template names or texts
# y = ["json1", "json2", ...]  # your JSON outputs

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Initialize Support Vector Classifier
svm_model = SVC(kernel='linear', random_state=42)  # You can try other kernels too (e.g., 'rbf')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate model performance
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


## **Gradient Boosting using XGBoost**

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample data (make sure to replace this with your actual dataset)
# X = ["login_alert", "new_features_update", ...]  # your template names or texts
# y = ["json1", "json2", ...]  # your JSON outputs

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Convert to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters
params = {
    'objective': 'multi:softmax',  # Specify multiclass classification
    'num_class': len(set(y)),       # Number of classes
    'max_depth': 3,                 # Depth of trees
    'eta': 0.1,                     # Learning rate
    'eval_metric': 'mlogloss',      # Evaluation metric
}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(dtest)

# Evaluate model performance
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


## **Gradient Boosting using lightgbm**

In [None]:
pip install lightbgm

In [None]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample data (make sure to replace this with your actual dataset)
# X = ["login_alert", "new_features_update", ...]  # your template names or texts
# y = ["json1", "json2", ...]  # your JSON outputs

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Create LightGBM dataset
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# Set parameters
params = {
    'objective': 'multiclass',         # Specify multiclass classification
    'num_class': len(set(y)),          # Number of classes
    'metric': 'multi_logloss',         # Evaluation metric
}

# Train the model
lgb_model = lgb.train(params, lgb_train, num_boost_round=100)

# Make predictions on the test set
y_pred_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
y_pred_lgb = [np.argmax(x) for x in y_pred_lgb]  # Get the index of the max probability

# Evaluate model performance
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred_lgb))
print(classification_report(y_test, y_pred_lgb))


## **Neural Network using Keras**

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow import keras

# Sample data (make sure to replace this with your actual dataset)
# X = ["login_alert", "new_features_update", ...]  # your template names or texts
# y = ["json1", "json2", ...]  # your JSON outputs (complex structures)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_vectorized = vectorizer.fit_transform(X).toarray()  # Convert to array for Keras

# Convert complex labels (y) into integer labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # y_encoded will contain integer labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Convert labels to categorical (one-hot encoding)
num_classes = len(set(y_encoded))
y_train_categorical = keras.utils.to_categorical(y_train, num_classes)
y_test_categorical = keras.utils.to_categorical(y_test, num_classes)

# Build the model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')  # Softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_categorical, epochs=500, batch_size=32, validation_split=0.2)

# Make predictions on the test set
y_pred_nn_prob = model.predict(X_test)
y_pred_nn = tf.argmax(y_pred_nn_prob, axis=1).numpy()  # Get the predicted class

# Evaluate model performance
from sklearn.metrics import accuracy_score, classification_report

print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print(classification_report(y_test, y_pred_nn))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 242ms/step - accuracy: 0.0000e+00 - loss: 4.7627 - val_accuracy: 0.0500 - val_loss: 4.7624
Epoch 2/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0602 - loss: 4.7432 - val_accuracy: 0.0500 - val_loss: 4.7683
Epoch 3/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1139 - loss: 4.7262 - val_accuracy: 0.0000e+00 - val_loss: 4.7738
Epoch 4/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.1571 - loss: 4.7095 - val_accuracy: 0.0000e+00 - val_loss: 4.7796
Epoch 5/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.2185 - loss: 4.6898 - val_accuracy: 0.0000e+00 - val_loss: 4.7858
Epoch 6/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2095 - loss: 4.6735 - val_accuracy: 0.0000e+00 - val_loss: 4.7926
Epoch 7/500
[1m3/3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Example of taking input from user:
user_input = input("Enter a template name or text: ")

# Step 1: Vectorize the user input
user_input_vectorized = vectorizer.transform([user_input]).toarray()

# Step 2: Predict the JSON category
user_input_pred_prob = model.predict(user_input_vectorized)
user_input_pred_class = tf.argmax(user_input_pred_prob, axis=1).numpy()

# Step 3: Get the predicted JSON output
predicted_json_output = label_encoder.inverse_transform(user_input_pred_class)

# Output
print("Predicted JSON for your input:", predicted_json_output[0])

Enter a template name or text:  hbrhbnh


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
Predicted JSON for your input: {
  "name": "Promotional Offer",
  "category": "MARKETING",
  "allow_category_change": false,
  "language": "en_US",
  "components": [
    {
      "type": "BODY",
      "text": "🎉 Great news, {{1}}! Enjoy a **20% discount** on your next purchase with code **SAVE20**. Hurry, this offer expires on {{2}}!",
      "example": {
        "body_text": [
          ["Alice", "October 31, 2024"]
        ]
      }
    },
    {
      "type": "BUTTONS",
      "buttons": [
        {
          "type": "URL",
          "text": "Shop Now",
          "url": "https://www.yourstore.com/shop"
        },
        {
          "type": "PHONE_NUMBER",
          "text": "Contact Us",
          "phone_number": "+1234567890"
        }
      ]
    }
  ]
}


In [7]:
import joblib
# After training and evaluation
model.save('my_neural_network_model.h5')  # Save the model
joblib.dump(label_encoder, 'label_encoder.pkl')  # Save the label encoder
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  # save the vectorizer



['tfidf_vectorizer.pkl']