In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.




# ML Model for Malayalam

In [None]:
  # Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Read the Malayalam train and test splits (both stored as CSV files).
malayalam_train, malayalam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
    )
)

# Preprocess data with dynamic column name handling
def preprocess(df):
    """Return a new two-column frame with normalised text and labels.

    The label column is taken from 'LABEL' or 'Label' (first one present) and
    lower-cased; the text column is taken from 'DATA' or 'Data' (first one
    present) and stripped of surrounding whitespace. The input frame is left
    untouched, so the function can be re-applied to the same raw frame.

    Parameters:
        df: DataFrame holding one text column and one label column.

    Returns:
        DataFrame with columns ['Data', 'Label'] sharing df's index.

    Raises:
        KeyError: if no known label column or no known text column exists.
    """
    available = ", ".join(map(str, df.columns))

    label_column = next((col for col in ('LABEL', 'Label') if col in df.columns), None)
    if label_column is None:
        raise KeyError("Label column not found. Available columns: " + available)

    data_column = next((col for col in ('DATA', 'Data') if col in df.columns), None)
    if data_column is None:
        raise KeyError("Data column not found. Available columns: " + available)

    # Build a fresh frame instead of adding columns to the caller's frame.
    return pd.DataFrame({
        'Data': df[data_column].str.strip(),
        'Label': df[label_column].str.lower(),
    })

# Normalise both splits to the shared ('Data', 'Label') schema.
malayalam_train = preprocess(malayalam_train)
malayalam_test = preprocess(malayalam_test)

# Vectorize text data: the TF-IDF vocabulary is fitted on the training split
# only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(malayalam_train['Data'])
X_test = vectorizer.transform(malayalam_test['Data'])

# Encode labels as integers (the encoder is fitted on the training labels).
encoder = LabelEncoder()
y_train = encoder.fit_transform(malayalam_train['Label'])
y_test = encoder.transform(malayalam_test['Label'])

# Fixed seed so the stochastic estimators report the same scores on every run.
RANDOM_STATE = 42

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # NOTE(review): SVC() already defaults to kernel='rbf', so "SVM" and
    # "Kernel SVM" are the same estimator and always score identically.
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Kernel SVM": SVC(kernel='rbf'),  # Using RBF kernel for non-linear SVM
    "SGD": SGDClassifier(random_state=RANDOM_STATE)
}

# Passing every encoded class id pins the confusion-matrix shape even when a
# model never predicts one of the classes.
class_ids = list(range(len(encoder.classes_)))

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy and macro-averaged precision, recall, and F1 score
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

    # Binary confusion matrix; the 4-way unpack fails loudly if the data is
    # not two-class.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    tn, fp, fn, tp = cm.ravel()

    # Per-class recall: sensitivity for encoded class 1, specificity for class 0.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0

    # G-mean is the geometric mean of the two per-class recalls. The macro
    # recall must not be used here: it is already the average of both.
    g_mean = np.sqrt(sensitivity * specificity)

    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision (Macro)": precision,
        "Recall (Macro)": recall,
        "Macro F1 Score": f1,
        "G-mean": g_mean
    }

# Output results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")


Logistic Regression:
  Accuracy: 0.67
  Precision (Macro): 0.67
  Recall (Macro): 0.67
  Macro F1 Score: 0.66
  G-mean: 0.64
SVM:
  Accuracy: 0.67
  Precision (Macro): 0.67
  Recall (Macro): 0.67
  Macro F1 Score: 0.66
  G-mean: 0.63
Random Forest:
  Accuracy: 0.65
  Precision (Macro): 0.65
  Recall (Macro): 0.65
  Macro F1 Score: 0.65
  G-mean: 0.61
Naive Bayes:
  Accuracy: 0.62
  Precision (Macro): 0.63
  Recall (Macro): 0.62
  Macro F1 Score: 0.62
  G-mean: 0.59
Decision Tree:
  Accuracy: 0.57
  Precision (Macro): 0.58
  Recall (Macro): 0.57
  Macro F1 Score: 0.57
  G-mean: 0.59
Kernel SVM:
  Accuracy: 0.67
  Precision (Macro): 0.67
  Recall (Macro): 0.67
  Macro F1 Score: 0.66
  G-mean: 0.63
SGD:
  Accuracy: 0.69
  Precision (Macro): 0.69
  Recall (Macro): 0.69
  Macro F1 Score: 0.69
  G-mean: 0.67


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Read the Malayalam train and test splits (both stored as CSV files).
malayalam_train, malayalam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
    )
)


# Preprocess data with dynamic column name handling
def preprocess(df):
    """Return a new two-column frame with normalised text and labels.

    The label column is taken from 'LABEL' or 'Label' (first one present) and
    lower-cased; the text column is taken from 'DATA' or 'Data' (first one
    present) and stripped of surrounding whitespace. The input frame is left
    untouched, so the function can be re-applied to the same raw frame.

    Parameters:
        df: DataFrame holding one text column and one label column.

    Returns:
        DataFrame with columns ['Data', 'Label'] sharing df's index.

    Raises:
        KeyError: if no known label column or no known text column exists.
    """
    available = ", ".join(map(str, df.columns))

    label_column = next((col for col in ('LABEL', 'Label') if col in df.columns), None)
    if label_column is None:
        raise KeyError("Label column not found. Available columns: " + available)

    data_column = next((col for col in ('DATA', 'Data') if col in df.columns), None)
    if data_column is None:
        raise KeyError("Data column not found. Available columns: " + available)

    # Build a fresh frame instead of adding columns to the caller's frame.
    return pd.DataFrame({
        'Data': df[data_column].str.strip(),
        'Label': df[label_column].str.lower(),
    })

# Normalise both splits to the shared ('Data', 'Label') schema.
malayalam_train = preprocess(malayalam_train)
malayalam_test = preprocess(malayalam_test)

# Vectorize text data: the TF-IDF vocabulary is fitted on the training split
# only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(malayalam_train['Data'])
X_test = vectorizer.transform(malayalam_test['Data'])

# Encode labels
encoder = LabelEncoder()
y_train = encoder.fit_transform(malayalam_train['Label'])
y_test = encoder.transform(malayalam_test['Label'])

# LabelEncoder numbers classes in sorted order, so the default pos_label=1
# would silently score the 'human' class. Name the positive class explicitly
# so binary precision/recall/F1 describe AI-text detection.
# encoder.transform raises ValueError if this label is absent from training.
POSITIVE_LABEL = 'ai'
positive_class = encoder.transform([POSITIVE_LABEL])[0]

# Fixed seed so the Random Forest reports the same scores on every run.
RANDOM_STATE = 42

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB()
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=positive_class
    )
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Output results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")


Logistic Regression:
  Accuracy: 0.67
  Precision: 0.65
  Recall: 0.71
  F1 Score: 0.68
SVM:
  Accuracy: 0.67
  Precision: 0.64
  Recall: 0.74
  F1 Score: 0.69
Random Forest:
  Accuracy: 0.68
  Precision: 0.64
  Recall: 0.79
  F1 Score: 0.71
Naive Bayes:
  Accuracy: 0.62
  Precision: 0.61
  Recall: 0.70
  F1 Score: 0.65


# ML Model for Tamil

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Read the Tamil train and test splits (both stored as CSV files).
tamil_train, tamil_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Preprocess data with dynamic column name handling
def preprocess(df, data_column, label_column_options):
    """Return a new two-column frame with normalised text and labels.

    Parameters:
        df: DataFrame holding the raw text and label columns.
        data_column: name of the text column; its values are stripped of
            surrounding whitespace.
        label_column_options: candidate label column names, tried in order;
            the first one present in df is used and lower-cased.

    Returns:
        DataFrame with columns ['Data', 'Label'] sharing df's index. The
        input frame is not modified.

    Raises:
        KeyError: if none of the label candidates, or the text column, is
            present in df.
    """
    available = ", ".join(map(str, df.columns))

    # Detect and use the first matching label column.
    label_column = next((col for col in label_column_options if col in df.columns), None)
    if label_column is None:
        raise KeyError("None of the label columns found. Available columns: " + available)

    # Ensure the data column exists.
    if data_column not in df.columns:
        raise KeyError(f"Data column '{data_column}' not found. Available columns: " + available)

    # Build a fresh frame instead of writing 'Data'/'Label' into the caller's.
    return pd.DataFrame({
        'Data': df[data_column].str.strip(),
        'Label': df[label_column].str.lower(),
    })

# Apply preprocessing (the two files name their text/label columns differently).
tamil_train = preprocess(tamil_train, 'DATA', ['LABEL', 'Label'])
tamil_test = preprocess(tamil_test, 'Data', ['Label'])  # Adjust the data column name if different

# Vectorize text data: the TF-IDF vocabulary is fitted on the training split
# only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(tamil_train['Data'])
X_test = vectorizer.transform(tamil_test['Data'])

# Encode labels as integers (the encoder is fitted on the training labels).
encoder = LabelEncoder()
y_train = encoder.fit_transform(tamil_train['Label'])
y_test = encoder.transform(tamil_test['Label'])

# Fixed seed so the stochastic estimators report the same scores on every run.
RANDOM_STATE = 42

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # NOTE(review): SVC() already defaults to kernel='rbf', so "SVM" and
    # "Kernel SVM" are the same estimator and always score identically.
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Kernel SVM": SVC(kernel='rbf'),  # Using RBF kernel for non-linear SVM
    "SGD": SGDClassifier(random_state=RANDOM_STATE)
}

# Passing every encoded class id pins the confusion-matrix shape even when a
# model never predicts one of the classes.
class_ids = list(range(len(encoder.classes_)))

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy and macro-averaged precision, recall, and F1 score
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

    # Binary confusion matrix; the 4-way unpack fails loudly if the data is
    # not two-class.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    tn, fp, fn, tp = cm.ravel()

    # Per-class recall: sensitivity for encoded class 1, specificity for class 0.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0

    # G-mean is the geometric mean of the two per-class recalls. The macro
    # recall must not be used here: it is already the average of both.
    g_mean = np.sqrt(sensitivity * specificity)

    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision (Macro)": precision,
        "Recall (Macro)": recall,
        "Macro F1 Score": f1,
        "G-mean": g_mean
    }

# Output results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")


Logistic Regression:
  Accuracy: 0.84
  Precision (Macro): 0.84
  Recall (Macro): 0.84
  Macro F1 Score: 0.84
  G-mean: 0.86
SVM:
  Accuracy: 0.80
  Precision (Macro): 0.80
  Recall (Macro): 0.80
  Macro F1 Score: 0.80
  G-mean: 0.83
Random Forest:
  Accuracy: 0.87
  Precision (Macro): 0.87
  Recall (Macro): 0.87
  Macro F1 Score: 0.87
  G-mean: 0.87
Naive Bayes:
  Accuracy: 0.68
  Precision (Macro): 0.70
  Recall (Macro): 0.69
  Macro F1 Score: 0.68
  G-mean: 0.75
Decision Tree:
  Accuracy: 0.72
  Precision (Macro): 0.72
  Recall (Macro): 0.72
  Macro F1 Score: 0.72
  G-mean: 0.75
Kernel SVM:
  Accuracy: 0.80
  Precision (Macro): 0.80
  Recall (Macro): 0.80
  Macro F1 Score: 0.80
  G-mean: 0.83
SGD:
  Accuracy: 0.84
  Precision (Macro): 0.84
  Recall (Macro): 0.84
  Macro F1 Score: 0.84
  G-mean: 0.86


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Load datasets
#mal_train = pd.read_csv('/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv')
#mal_test = pd.read_csv('/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv')

# Read the Tamil train and test splits (both stored as CSV files).
tamil_train, tamil_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

#tamil_train = pd.read_csv('../input/final-dataset/tam_training_data_hum_ai.csv')
#tamil_test = pd.read_excel('../input/final-dataset/tamil-test.xlsx')

# Preprocess data with dynamic column name handling
def preprocess(df, data_column, label_column_options):
    """Return a new two-column frame with normalised text and labels.

    Parameters:
        df: DataFrame holding the raw text and label columns.
        data_column: name of the text column; its values are stripped of
            surrounding whitespace.
        label_column_options: candidate label column names, tried in order;
            the first one present in df is used and lower-cased.

    Returns:
        DataFrame with columns ['Data', 'Label'] sharing df's index. The
        input frame is not modified.

    Raises:
        KeyError: if none of the label candidates, or the text column, is
            present in df.
    """
    available = ", ".join(map(str, df.columns))

    # Detect and use the first matching label column.
    label_column = next((col for col in label_column_options if col in df.columns), None)
    if label_column is None:
        raise KeyError("None of the label columns found. Available columns: " + available)

    # Ensure the data column exists.
    if data_column not in df.columns:
        raise KeyError(f"Data column '{data_column}' not found. Available columns: " + available)

    # Build a fresh frame instead of writing 'Data'/'Label' into the caller's.
    return pd.DataFrame({
        'Data': df[data_column].str.strip(),
        'Label': df[label_column].str.lower(),
    })

# Apply preprocessing (the two files name their text/label columns differently).
tamil_train = preprocess(tamil_train, 'DATA', ['LABEL', 'Label'])
tamil_test = preprocess(tamil_test, 'Data', ['Label'])  # Adjust the data column name if different

# Vectorize text data: the TF-IDF vocabulary is fitted on the training split
# only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(tamil_train['Data'])
X_test = vectorizer.transform(tamil_test['Data'])

# Encode labels
encoder = LabelEncoder()
y_train = encoder.fit_transform(tamil_train['Label'])
y_test = encoder.transform(tamil_test['Label'])

# LabelEncoder numbers classes in sorted order, so the default pos_label=1
# would silently score the 'human' class. Name the positive class explicitly
# so binary precision/recall/F1 describe AI-text detection.
# encoder.transform raises ValueError if this label is absent from training.
POSITIVE_LABEL = 'ai'
positive_class = encoder.transform([POSITIVE_LABEL])[0]

# Fixed seed so the Random Forest reports the same scores on every run.
RANDOM_STATE = 42

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB()
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', pos_label=positive_class
    )
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Output results
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")


Logistic Regression:
  Accuracy: 0.84
  Precision: 0.88
  Recall: 0.81
  F1 Score: 0.84
SVM:
  Accuracy: 0.80
  Precision: 0.85
  Recall: 0.75
  F1 Score: 0.80
Random Forest:
  Accuracy: 0.86
  Precision: 0.88
  Recall: 0.85
  F1 Score: 0.86
Naive Bayes:
  Accuracy: 0.68
  Precision: 0.76
  Recall: 0.56
  F1 Score: 0.64


In [None]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Parameters setup: tokenizer vocabulary cap, padded sequence length,
# embedding width, and the placeholder token for out-of-vocabulary words.
vocab_size, max_length, embedding_dim = 10000, 200, 128
oov_tok = "<OOV>"

# Load datasets: training splits are CSV, test splits are Excel workbooks.
mal_train, mal_test = (
    pd.read_csv('/kaggle/input/final-dataset/mal_training_data_hum_ai.csv'),
    pd.read_excel('/kaggle/input/final-dataset/mal_test.xlsx'),
)
tam_train, tam_test = (
    pd.read_csv('/kaggle/input/final-dataset/tam_training_data_hum_ai.csv'),
    pd.read_excel('/kaggle/input/final-dataset/tamil-test.xlsx'),
)

# Labels are normalised (stripped, lower-cased) before lookup, so the mapping
# only needs the lower-case spellings.
label_mapping = {'human': 0, 'ai': 1}

def convert_labels(df, label_col):
    """Map the string labels in df[label_col] to integer class ids.

    Each value is converted to text, stripped and lower-cased, then looked up
    in label_mapping ('human' -> 0, 'ai' -> 1).

    Parameters:
        df: DataFrame containing the label column.
        label_col: name of the label column.

    Returns:
        numpy array of class ids, one per row, in row order.

    Raises:
        ValueError: if any normalised label is not a key of label_mapping.
    """
    converted_labels = []
    for label in df[label_col]:
        # str() keeps non-string cells (e.g. NaN) from raising AttributeError;
        # they are reported as unmapped labels instead.
        normalised = str(label).strip().lower()
        if normalised not in label_mapping:
            raise ValueError(f"Unmapped label found: {normalised}. Please update the label_mapping.")
        converted_labels.append(label_mapping[normalised])
    return np.array(converted_labels)

# Convert the string labels to 0/1 in place. An unmapped label raises
# ValueError from convert_labels and stops the cell here, instead of being
# printed and ignored while training continues on unconverted labels.
mal_train['LABEL'] = convert_labels(mal_train, 'LABEL')
mal_test['Label'] = convert_labels(mal_test, 'Label')
tam_train['LABEL'] = convert_labels(tam_train, 'LABEL')
tam_test['Label'] = convert_labels(tam_test, 'Label')

# Build the vocabulary from the training text only, so no test-set tokens
# leak into the tokenizer; unseen test words map to the OOV token.
train_text = pd.concat([mal_train['DATA'], tam_train['DATA']])

# Tokenization and padding sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)
train_padded_mal = tokenizer.texts_to_sequences(mal_train['DATA'])
train_padded_mal = pad_sequences(train_padded_mal, maxlen=max_length)
test_padded_mal = tokenizer.texts_to_sequences(mal_test['DATA'])
test_padded_mal = pad_sequences(test_padded_mal, maxlen=max_length)

train_padded_tam = tokenizer.texts_to_sequences(tam_train['DATA'])
train_padded_tam = pad_sequences(train_padded_tam, maxlen=max_length)
# The Tamil test sheet names its text column 'Data' rather than 'DATA'.
test_padded_tam = tokenizer.texts_to_sequences(tam_test['Data'])
test_padded_tam = pad_sequences(test_padded_tam, maxlen=max_length)

# Define and compile a simple model
def create_model():
    """Build and compile the 1-D CNN binary text classifier.

    Layers: Embedding -> Conv1D(128 filters, width 5) -> global average
    pooling -> Dense(24, relu) -> Dense(1, sigmoid). Reads vocab_size and
    embedding_dim from the notebook globals.

    Returns:
        A Sequential model compiled with binary cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    model = Sequential([
        # input_length is deprecated in Keras 3 and is not needed: nothing
        # downstream requires a static sequence length.
        Embedding(vocab_size, embedding_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train and evaluate the model
def train_and_evaluate(train_padded, train_labels, test_padded, test_labels):
    """Train a fresh model from create_model() and score it on the test split.

    Parameters:
        train_padded, train_labels: padded training sequences and 0/1 labels.
        test_padded, test_labels: padded test sequences and 0/1 labels.

    Returns:
        Tuple (precision, recall, f1, accuracy); precision, recall and F1 use
        average='binary'.
    """
    classifier = create_model()
    classifier.fit(train_padded, train_labels, epochs=10, verbose=2)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    probabilities = classifier.predict(test_padded)
    predicted = (probabilities > 0.5).astype(int)

    prec, rec, f_score, _support = precision_recall_fscore_support(
        test_labels, predicted, average='binary'
    )
    acc = accuracy_score(test_labels, predicted)
    return prec, rec, f_score, acc

# Re-seed Python, NumPy and TensorFlow before each run so weight
# initialisation and batch shuffling repeat across notebook runs.
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42

# Evaluate models for Malayalam
tf.keras.utils.set_random_seed(SEED)
precision_mal, recall_mal, f1_mal, accuracy_mal = train_and_evaluate(
    train_padded_mal, mal_train['LABEL'], test_padded_mal, mal_test['Label']
)
print(f"Malayalam - Precision: {precision_mal}, Recall: {recall_mal}, F1: {f1_mal}, Accuracy: {accuracy_mal}")

# Evaluate models for Tamil
tf.keras.utils.set_random_seed(SEED)
precision_tam, recall_tam, f1_tam, accuracy_tam = train_and_evaluate(
    train_padded_tam, tam_train['LABEL'], test_padded_tam, tam_test['Label']
)
print(f"Tamil - Precision: {precision_tam}, Recall: {recall_tam}, F1: {f1_tam}, Accuracy: {accuracy_tam}")




Epoch 1/10
25/25 - 3s - 110ms/step - accuracy: 0.5312 - loss: 0.6928
Epoch 2/10
25/25 - 1s - 39ms/step - accuracy: 0.5663 - loss: 0.6911
Epoch 3/10
25/25 - 1s - 39ms/step - accuracy: 0.7412 - loss: 0.6484
Epoch 4/10
25/25 - 1s - 39ms/step - accuracy: 0.7600 - loss: 0.5464
Epoch 5/10
25/25 - 1s - 39ms/step - accuracy: 0.8500 - loss: 0.3968
Epoch 6/10
25/25 - 1s - 51ms/step - accuracy: 0.8963 - loss: 0.2905
Epoch 7/10
25/25 - 1s - 39ms/step - accuracy: 0.9325 - loss: 0.1983
Epoch 8/10
25/25 - 1s - 40ms/step - accuracy: 0.9550 - loss: 0.1448
Epoch 9/10
25/25 - 1s - 39ms/step - accuracy: 0.9712 - loss: 0.1148
Epoch 10/10
25/25 - 1s - 39ms/step - accuracy: 0.9750 - loss: 0.0941
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Malayalam - Precision: 0.7582417582417582, Recall: 0.69, F1: 0.7225130890052356, Accuracy: 0.735
Epoch 1/10




26/26 - 3s - 97ms/step - accuracy: 0.4988 - loss: 0.6949
Epoch 2/10
26/26 - 1s - 38ms/step - accuracy: 0.4765 - loss: 0.6959
Epoch 3/10
26/26 - 1s - 38ms/step - accuracy: 0.4641 - loss: 0.6945
Epoch 4/10
26/26 - 1s - 39ms/step - accuracy: 0.7178 - loss: 0.6864
Epoch 5/10
26/26 - 1s - 38ms/step - accuracy: 0.7426 - loss: 0.6469
Epoch 6/10
26/26 - 1s - 39ms/step - accuracy: 0.8589 - loss: 0.4972
Epoch 7/10
26/26 - 1s - 39ms/step - accuracy: 0.8676 - loss: 0.2985
Epoch 8/10
26/26 - 1s - 39ms/step - accuracy: 0.9233 - loss: 0.1900
Epoch 9/10
26/26 - 1s - 41ms/step - accuracy: 0.7673 - loss: 0.7899
Epoch 10/10
26/26 - 1s - 53ms/step - accuracy: 0.9455 - loss: 0.2096
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Tamil - Precision: 0.88, Recall: 0.9166666666666666, F1: 0.8979591836734694, Accuracy: 0.9


# Deep learning models for Malayalam and Tamil

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, GRU
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Parameters: tokenizer vocabulary cap, padded sequence length, embedding
# width, and the placeholder token for out-of-vocabulary words.
vocab_size, max_length = 10000, 200
embedding_dim = 128
oov_tok = "<OOV>"

# Load datasets (all four splits are CSV files).
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Normalize labels
label_mapping = {'human': 0, 'ai': 1}
def convert_labels(labels):
    """Map an iterable of string labels to a numpy array of class ids.

    Each label is converted to text, stripped and lower-cased before the
    lookup, so 'AI', ' ai ' and 'ai' all map to 1 and any casing of 'human'
    maps to 0.

    Raises:
        KeyError: if a normalised label is not a key of label_mapping.
    """
    encoded = []
    for label in labels:
        # str() keeps non-string cells (e.g. NaN) from raising AttributeError.
        key = str(label).strip().lower()
        if key not in label_mapping:
            raise KeyError(f"Unmapped label {label!r}; expected one of {sorted(label_mapping)}")
        encoded.append(label_mapping[key])
    return np.array(encoded)

# Encode the four label columns as 0/1 arrays (train files use 'LABEL',
# test files use 'Label').
mal_train_labels, mal_test_labels, tam_train_labels, tam_test_labels = map(
    convert_labels,
    (mal_train['LABEL'], mal_test['Label'], tam_train['LABEL'], tam_test['Label']),
)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# The vocabulary is fitted on both training splits combined; the test splits
# are only transformed with it.
tokenizer.fit_on_texts(pd.concat([mal_train['DATA'], tam_train['DATA']]))  # Combine text for tokenizing
mal_train_seq, mal_test_seq, tam_train_seq, tam_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)
    for texts in (mal_train['DATA'], mal_test['DATA'], tam_train['DATA'], tam_test['Data'])
)

# Model definitions
def create_cnn_model():
    """Return an uncompiled CNN: Embedding -> Conv1D(128, 5) -> average pool -> Dense head."""
    return Sequential([
        # input_length is deprecated in Keras 3 and the sibling builders
        # below already omit it; no layer here needs a static length.
        Embedding(vocab_size, embedding_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_gru_model():
    """Return an uncompiled GRU model: Embedding -> GRU(128) -> Dense head."""
    return Sequential([
        Embedding(vocab_size, embedding_dim),
        GRU(128),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_cnn_lstm_model():
    """Return an uncompiled CNN-LSTM: Embedding -> Conv1D(64, 5) -> LSTM(64) -> Dense head."""
    return Sequential([
        Embedding(vocab_size, embedding_dim),
        Conv1D(64, 5, activation='relu'),
        LSTM(64),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_cnn_bilstm_model():
    """Return an uncompiled CNN-BiLSTM: Embedding -> Conv1D(64, 5) -> bidirectional LSTM(64) -> Dense head."""
    return Sequential([
        Embedding(vocab_size, embedding_dim),
        Conv1D(64, 5, activation='relu'),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

# Train and evaluate
def train_and_evaluate(model, train_data, train_labels, test_data, test_labels, language, model_name):
    """Compile and fit `model`, then print its binary metrics on the test split.

    Parameters:
        model: Keras model to compile and train (it is fitted in place).
        train_data, train_labels: padded training sequences and 0/1 labels.
        test_data, test_labels: padded test sequences and 0/1 labels.
        language, model_name: strings used only in the printed summary line.

    Returns:
        None; the metrics are printed.
    """
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_data, train_labels, epochs=10, verbose=2)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    probabilities = model.predict(test_data)
    predictions = (probabilities > 0.5).astype(int)

    precision, recall, f1, _support = precision_recall_fscore_support(
        test_labels, predictions, average='binary'
    )
    accuracy = accuracy_score(test_labels, predictions)
    print(f"{language} - {model_name} - Precision: {precision}, Recall: {recall}, F1: {f1}, Accuracy: {accuracy}")

# Map each architecture name to its builder rather than to a built model.
# A new model is created for every (architecture, language) run; reusing one
# instance would start the Tamil run from Malayalam-trained weights.
models = {
    'CNN': create_cnn_model,
    'GRU': create_gru_model,
    'CNN-LSTM': create_cnn_lstm_model,
    'CNN-BiLSTM': create_cnn_bilstm_model
}

# (language, train sequences, train labels, test sequences, test labels)
language_splits = (
    ("Malayalam", mal_train_seq, mal_train_labels, mal_test_seq, mal_test_labels),
    ("Tamil", tam_train_seq, tam_train_labels, tam_test_seq, tam_test_labels),
)

# Re-seeding before each run makes weight initialisation and shuffling
# repeatable. NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42

for model_name, build_model in models.items():
    for language, train_seq, train_labels, test_seq, test_labels in language_splits:
        print(f"\nEvaluating {model_name} model for {language}:")
        tf.keras.utils.set_random_seed(SEED)
        train_and_evaluate(build_model(), train_seq, train_labels, test_seq, test_labels, language, model_name)



Evaluating CNN model for Malayalam:
Epoch 1/10




25/25 - 3s - 125ms/step - accuracy: 0.5200 - loss: 0.6938
Epoch 2/10
25/25 - 1s - 42ms/step - accuracy: 0.5238 - loss: 0.6932
Epoch 3/10
25/25 - 1s - 43ms/step - accuracy: 0.5987 - loss: 0.6865
Epoch 4/10
25/25 - 1s - 43ms/step - accuracy: 0.7987 - loss: 0.6326
Epoch 5/10
25/25 - 1s - 46ms/step - accuracy: 0.8612 - loss: 0.4601
Epoch 6/10
25/25 - 1s - 45ms/step - accuracy: 0.9013 - loss: 0.2989
Epoch 7/10
25/25 - 1s - 43ms/step - accuracy: 0.9150 - loss: 0.2237
Epoch 8/10
25/25 - 1s - 45ms/step - accuracy: 0.9513 - loss: 0.1473
Epoch 9/10
25/25 - 1s - 45ms/step - accuracy: 0.9762 - loss: 0.1037
Epoch 10/10
25/25 - 1s - 44ms/step - accuracy: 0.9737 - loss: 0.0759
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Malayalam - CNN - Precision: 0.7117117117117117, Recall: 0.79, F1: 0.7488151658767773, Accuracy: 0.735

Evaluating CNN model for Tamil:
Epoch 1/10
26/26 - 3s - 102ms/step - accuracy: 0.6423 - loss: 0.7152
Epoch 2/10
26/26 - 1s - 47ms/step - accuracy: 0.8181

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Attention, Input
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

# Parameters: tokenizer vocabulary cap, padded sequence length, embedding
# width, and the placeholder token for out-of-vocabulary words.
vocab_size, max_length = 10000, 200
embedding_dim = 100  # Embedding dimension
oov_tok = "<OOV>"

# Load datasets (all four splits are CSV files).
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Normalize labels
label_mapping = {'human': 0, 'ai': 1}
def convert_labels(labels):
    """Map an iterable of string labels to a numpy array of class ids.

    Each label is converted to text, stripped and lower-cased before the
    lookup, so 'AI', ' ai ' and 'ai' all map to 1 and any casing of 'human'
    maps to 0.

    Raises:
        KeyError: if a normalised label is not a key of label_mapping.
    """
    encoded = []
    for label in labels:
        # str() keeps non-string cells (e.g. NaN) from raising AttributeError.
        key = str(label).strip().lower()
        if key not in label_mapping:
            raise KeyError(f"Unmapped label {label!r}; expected one of {sorted(label_mapping)}")
        encoded.append(label_mapping[key])
    return np.array(encoded)

# Encode the four label columns as 0/1 arrays (train files use 'LABEL',
# test files use 'Label').
mal_train_labels, mal_test_labels, tam_train_labels, tam_test_labels = map(
    convert_labels,
    (mal_train['LABEL'], mal_test['Label'], tam_train['LABEL'], tam_test['Label']),
)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# The vocabulary is fitted on both training splits combined; the test splits
# are only transformed with it.
tokenizer.fit_on_texts(pd.concat([mal_train['DATA'], tam_train['DATA']]))  # Combine text for tokenizing
mal_train_seq, mal_test_seq, tam_train_seq, tam_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)
    for texts in (mal_train['DATA'], mal_test['DATA'], tam_train['DATA'], tam_test['Data'])
)

# Model definitions
def create_cnn_model():
    """Return an uncompiled CNN: Embedding -> Conv1D(128, 5) -> average pool -> Dense head."""
    return Sequential([
        # input_length is deprecated in Keras 3; no layer here needs a
        # static sequence length, so it is simply omitted.
        Embedding(vocab_size, embedding_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_model():
    """Return an uncompiled BiLSTM: Embedding -> bidirectional LSTM(64) -> Dense head."""
    return Sequential([
        Embedding(vocab_size, embedding_dim),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_attention_model():
    """Return an uncompiled functional model: BiLSTM with self-attention.

    The BiLSTM emits one vector per time step; Attention is applied with the
    same tensor as query and value, the result is averaged over time and fed
    to the Dense(24) -> Dense(1, sigmoid) head.
    """
    # The sequence length is fixed by the Input layer, so Embedding does not
    # need the deprecated input_length argument.
    input_layer = Input(shape=(max_length,))
    embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)
    bilstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
    attention_layer = Attention()([bilstm_layer, bilstm_layer])
    attention_pooling = GlobalAveragePooling1D()(attention_layer)
    dense_layer = Dense(24, activation='relu')(attention_pooling)
    output_layer = Dense(1, activation='sigmoid')(dense_layer)
    return tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Function to calculate G-mean
def calculate_gmean(conf_matrix):
    """Return the G-mean, sqrt(sensitivity * specificity), of a binary confusion matrix.

    Parameters:
        conf_matrix: 2x2 array-like laid out as [[TN, FP], [FN, TP]].

    Returns:
        The geometric mean of sensitivity (TP / (TP + FN)) and specificity
        (TN / (TN + FP)). A rate whose denominator is zero counts as 0.0, so
        the result is 0.0 rather than NaN when a class has no true samples.

    Raises:
        ValueError: if conf_matrix is not 2x2.
    """
    matrix = np.asarray(conf_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(f"Expected a 2x2 confusion matrix, got shape {matrix.shape}")

    tn, fp, fn, tp = (float(cell) for cell in matrix.ravel())
    actual_positives = tp + fn
    actual_negatives = tn + fp
    sensitivity = tp / actual_positives if actual_positives else 0.0  # Recall
    specificity = tn / actual_negatives if actual_negatives else 0.0
    return np.sqrt(sensitivity * specificity)

# Train and evaluate function
def train_and_evaluate(model, train_data, train_labels, test_data, test_labels, language, model_name):
    """Compile and fit `model`, then print and return its test-set metrics.

    Parameters:
        model: Keras model to compile and train (it is fitted in place).
        train_data, train_labels: padded training sequences and 0/1 labels.
        test_data, test_labels: padded test sequences and 0/1 labels.
        language, model_name: strings used only in the printed report.

    Returns:
        Dict with keys 'precision', 'recall', 'f1', 'accuracy', 'gmean' and
        'confusion_matrix' (the same values that are printed).
    """
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_data, train_labels, epochs=10, verbose=2)
    # Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
    predictions = (model.predict(test_data) > 0.5).astype(int).ravel()

    # Compute metrics. zero_division=0 reports 0 without a warning when the
    # model never predicts the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, predictions, average='binary', zero_division=0
    )
    accuracy = accuracy_score(test_labels, predictions)
    # labels=[0, 1] keeps the matrix 2x2 even if a class never appears.
    conf_matrix = confusion_matrix(test_labels, predictions, labels=[0, 1])
    gmean = calculate_gmean(conf_matrix)

    # Print the results
    print(f"{language} - {model_name}:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    print(f"Accuracy: {accuracy}")
    print(f"G-mean: {gmean}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("="*50)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
        'gmean': gmean,
        'confusion_matrix': conf_matrix,
    }

# Map each architecture name to its builder rather than to a built model.
# A new model is created for every (architecture, language) run; reusing one
# instance would start the Tamil run from Malayalam-trained weights.
models = {
    'CNN': create_cnn_model,
    'BiLSTM': create_bilstm_model,
    'BiLSTM + Attention': create_bilstm_attention_model
}

# (language, train sequences, train labels, test sequences, test labels)
language_splits = (
    ("Malayalam", mal_train_seq, mal_train_labels, mal_test_seq, mal_test_labels),
    ("Tamil", tam_train_seq, tam_train_labels, tam_test_seq, tam_test_labels),
)

# Re-seeding before each run makes weight initialisation and shuffling
# repeatable. NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42

# Evaluate for Malayalam and Tamil
for model_name, build_model in models.items():
    for language, train_seq, train_labels, test_seq, test_labels in language_splits:
        print(f"\nEvaluating {model_name} model for {language}:")
        tf.keras.utils.set_random_seed(SEED)
        train_and_evaluate(build_model(), train_seq, train_labels, test_seq, test_labels, language, model_name)



Evaluating CNN model for Malayalam:
Epoch 1/10




25/25 - 3s - 105ms/step - accuracy: 0.4825 - loss: 0.6938
Epoch 2/10
25/25 - 1s - 33ms/step - accuracy: 0.5263 - loss: 0.6919
Epoch 3/10
25/25 - 1s - 34ms/step - accuracy: 0.6587 - loss: 0.6666
Epoch 4/10
25/25 - 1s - 52ms/step - accuracy: 0.8475 - loss: 0.5334
Epoch 5/10
25/25 - 1s - 39ms/step - accuracy: 0.8700 - loss: 0.3706
Epoch 6/10
25/25 - 1s - 39ms/step - accuracy: 0.9013 - loss: 0.2963
Epoch 7/10
25/25 - 1s - 38ms/step - accuracy: 0.9488 - loss: 0.1606
Epoch 8/10
25/25 - 1s - 36ms/step - accuracy: 0.9737 - loss: 0.1056
Epoch 9/10
25/25 - 1s - 37ms/step - accuracy: 0.9850 - loss: 0.0698
Epoch 10/10
25/25 - 1s - 40ms/step - accuracy: 0.9912 - loss: 0.0426
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Malayalam - CNN:
Precision: 0.7181818181818181
Recall: 0.79
F1: 0.7523809523809524
Accuracy: 0.74
G-mean: 0.7383088784512889
Confusion Matrix:
[[69 31]
 [21 79]]

Evaluating CNN model for Tamil:
Epoch 1/10
26/26 - 3s - 96ms/step - accuracy: 0.5198 - loss: 1

  _warn_prf(average, modifier, msg_start, len(result))


# DL models for Malayalam and Tamil satisfying all conditions

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Attention, Input
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

# Parameters
vocab_size = 10000     # passed as num_words to the Tokenizer and as the Embedding input size
max_length = 200       # every token sequence is padded/truncated to this length
embedding_dim = 100    # Embedding dimension
oov_tok = "<OOV>"      # token substituted for out-of-vocabulary words

# Load datasets: Malayalam train/test, then Tamil train/test (read in this order)
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Normalize labels
label_mapping = {'human': 0, 'ai': 1}
def convert_labels(labels):
    """Map textual class labels to the integer ids in ``label_mapping``.

    Matching is case-insensitive and ignores surrounding whitespace, so
    'Human', 'AI ' and 'ai' are all accepted.

    Args:
        labels: Iterable of label values (e.g. a pandas Series of strings).

    Returns:
        np.ndarray of ints, one entry per input label.

    Raises:
        KeyError: If a label is not a key of ``label_mapping`` after
            normalisation; the message names the offending value and position.
    """
    encoded = []
    for position, label in enumerate(labels):
        key = str(label).strip().lower()
        if key not in label_mapping:
            raise KeyError(
                f"Unknown label {label!r} at position {position}; "
                f"expected one of {sorted(label_mapping)}"
            )
        encoded.append(label_mapping[key])
    return np.array(encoded, dtype=int)

# Integer labels for the four splits (train files use 'LABEL', test files 'Label')
mal_train_labels, mal_test_labels, tam_train_labels, tam_test_labels = (
    convert_labels(label_column)
    for label_column in (mal_train['LABEL'], mal_test['Label'], tam_train['LABEL'], tam_test['Label'])
)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
combined_train_text = pd.concat([mal_train['DATA'], tam_train['DATA']])  # Combine text for tokenizing
tokenizer.fit_on_texts(combined_train_text)


def _texts_to_padded(texts):
    """Turn raw texts into integer sequences padded/truncated to ``max_length``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)


mal_train_seq = _texts_to_padded(mal_train['DATA'])
mal_test_seq = _texts_to_padded(mal_test['DATA'])
tam_train_seq = _texts_to_padded(tam_train['DATA'])
tam_test_seq = _texts_to_padded(tam_test['Data'])  # the Tamil test file names its text column 'Data'

# Model definitions
def create_cnn_model():
    """Build the (uncompiled) CNN text classifier.

    Layers: trainable Embedding -> Conv1D(128 filters, kernel size 5, ReLU)
    -> global average pooling -> Dense(24, ReLU) -> Dense(1, sigmoid).

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.
    """
    return Sequential([
        # The input shape is declared with an explicit Input layer; the
        # Embedding `input_length` argument is deprecated in Keras 3.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_model():
    """Build the (uncompiled) bidirectional-LSTM text classifier.

    Layers: trainable Embedding -> Bidirectional(LSTM(64)) -> Dense(24, ReLU)
    -> Dense(1, sigmoid).

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.
    """
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_attention_model():
    """Build the (uncompiled) BiLSTM + self-attention text classifier.

    Layers: Input -> trainable Embedding -> Bidirectional(LSTM(64)) returning
    the full sequence -> Attention with the BiLSTM output used as both query
    and value -> global average pooling -> Dense(24, ReLU) -> Dense(1, sigmoid).

    Returns:
        A functional ``tf.keras.Model`` mapping ``max_length`` token ids to one
        sigmoid probability.
    """
    token_ids = Input(shape=(max_length,))
    # The Input layer already fixes the sequence length, so the deprecated
    # Embedding `input_length` argument is not needed.
    embedded = Embedding(vocab_size, embedding_dim)(token_ids)
    sequence_states = Bidirectional(LSTM(64, return_sequences=True))(embedded)
    attended = Attention()([sequence_states, sequence_states])
    pooled = GlobalAveragePooling1D()(attended)
    hidden = Dense(24, activation='relu')(pooled)
    probability = Dense(1, activation='sigmoid')(hidden)
    return tf.keras.Model(inputs=token_ids, outputs=probability)

def create_keras_cnn_model():
    """Build the "Keras + CNN" classifier.

    This architecture is identical to the one built by ``create_cnn_model``
    (trainable Keras Embedding followed by the same Conv1D/pooling/Dense
    stack), so it delegates instead of duplicating the layer list. Every call
    returns a new, independently initialised model.

    Returns:
        A Keras ``Sequential`` model, uncompiled.
    """
    return create_cnn_model()

def create_glove_cnn_model(embedding_matrix=None):
    """Build the (uncompiled) CNN classifier with a frozen embedding layer.

    Args:
        embedding_matrix: Optional array of shape (vocab_size, embedding_dim)
            with pre-trained word vectors (e.g. GloVe) used to initialise the
            frozen Embedding layer.
            NOTE(review): when omitted (the original behaviour) the embedding
            is frozen at its random initial values — no GloVe vectors are
            loaded anywhere in this cell — so the layer carries no learned
            information.

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            (vocab_size, embedding_dim).
    """
    embedding_options = {'trainable': False}
    if embedding_matrix is not None:
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix must have shape {(vocab_size, embedding_dim)}, "
                f"got {embedding_matrix.shape}"
            )
        embedding_options['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_matrix)
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim, **embedding_options),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_glove_bilstm_model(embedding_matrix=None):
    """Build the (uncompiled) BiLSTM classifier with a frozen embedding layer.

    Args:
        embedding_matrix: Optional array of shape (vocab_size, embedding_dim)
            with pre-trained word vectors (e.g. GloVe) used to initialise the
            frozen Embedding layer.
            NOTE(review): when omitted (the original behaviour) the embedding
            is frozen at its random initial values — no GloVe vectors are
            loaded anywhere in this cell.

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            (vocab_size, embedding_dim).
    """
    embedding_options = {'trainable': False}
    if embedding_matrix is not None:
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix must have shape {(vocab_size, embedding_dim)}, "
                f"got {embedding_matrix.shape}"
            )
        embedding_options['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_matrix)
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim, **embedding_options),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

# Function to calculate G-mean
def calculate_gmean(conf_matrix):
    """Geometric mean of sensitivity and specificity for a binary classifier.

    Args:
        conf_matrix: 2x2 confusion matrix laid out as [[TN, FP], [FN, TP]]
            (array or nested list).

    Returns:
        float: sqrt(sensitivity * specificity). A rate whose denominator is
        zero (no samples of that class) is taken as 0.0 rather than NaN.

    Raises:
        ValueError: If ``conf_matrix`` is not 2x2.
    """
    matrix = np.asarray(conf_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"calculate_gmean expects a 2x2 confusion matrix, got shape {matrix.shape}"
        )
    TN, FP, FN, TP = matrix.ravel()
    actual_positives = TP + FN
    actual_negatives = TN + FP
    # Guard the divisions: 0/0 would otherwise give NaN plus a RuntimeWarning.
    sensitivity = TP / actual_positives if actual_positives else 0.0  # Recall
    specificity = TN / actual_negatives if actual_negatives else 0.0
    return float(np.sqrt(sensitivity * specificity))

# Train and evaluate function
def train_and_evaluate(model, train_data, train_labels, test_data, test_labels, language, model_name):
    """Compile and train a binary classifier, then print its test-set metrics.

    Args:
        model: Keras model with a single sigmoid output; it is (re)compiled
            here. compile() does not re-initialise weights, so pass a freshly
            built model when runs must be independent.
        train_data: Training inputs passed to ``model.fit``.
        train_labels: 0/1 training labels.
        test_data: Test inputs passed to ``model.predict``.
        test_labels: 0/1 ground-truth test labels.
        language: Language name used in the printed report.
        model_name: Model name used in the printed report.

    Returns:
        dict with keys 'precision', 'recall', 'f1', 'accuracy', 'gmean' and
        'confusion_matrix' (previously nothing was returned).
    """
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_data, train_labels, epochs=10, verbose=2)
    # Threshold the probabilities at 0.5 and flatten to a 1-D label vector,
    # which is the shape the sklearn metric functions expect.
    predictions = (model.predict(test_data) > 0.5).astype(int).ravel()

    # Compute metrics
    # zero_division=0 keeps the 0.0 result but silences the undefined-metric
    # warning raised when the model predicts no positive samples.
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, predictions, average='binary', zero_division=0
    )
    accuracy = accuracy_score(test_labels, predictions)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
    conf_matrix = confusion_matrix(test_labels, predictions, labels=[0, 1])
    gmean = calculate_gmean(conf_matrix)

    # Print the results
    print(f"{language} - {model_name}:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    print(f"Accuracy: {accuracy}")
    print(f"G-mean: {gmean}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("="*50)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
        'gmean': gmean,
        'confusion_matrix': conf_matrix,
    }

# Instantiate and evaluate models
# Map each display name to the function that builds the model. A new model is
# built for every (model, language) pair: previously one instance was trained
# on Malayalam and then trained further on Tamil, so the Tamil scores came
# from weights already fitted to Malayalam.
model_builders = {
    'CNN': create_cnn_model,
    'BiLSTM': create_bilstm_model,
    'BiLSTM + Attention': create_bilstm_attention_model,
    'Keras + CNN': create_keras_cnn_model,
    'GloVe + CNN': create_glove_cnn_model,
    'GloVe + BiLSTM': create_glove_bilstm_model
}

# (train inputs, train labels, test inputs, test labels) per language
language_splits = {
    "Malayalam": (mal_train_seq, mal_train_labels, mal_test_seq, mal_test_labels),
    "Tamil": (tam_train_seq, tam_train_labels, tam_test_seq, tam_test_labels),
}

# Evaluate for Malayalam and Tamil
for model_name, build_model in model_builders.items():
    for language, (lang_train_seq, lang_train_labels, lang_test_seq, lang_test_labels) in language_splits.items():
        print(f"\nEvaluating {model_name} model for {language}:")
        train_and_evaluate(build_model(), lang_train_seq, lang_train_labels,
                           lang_test_seq, lang_test_labels, language, model_name)





Evaluating CNN model for Malayalam:
Epoch 1/10
25/25 - 3s - 134ms/step - accuracy: 0.4812 - loss: 0.6938
Epoch 2/10
25/25 - 1s - 39ms/step - accuracy: 0.5038 - loss: 0.6972
Epoch 3/10
25/25 - 1s - 53ms/step - accuracy: 0.5938 - loss: 0.6879
Epoch 4/10
25/25 - 1s - 45ms/step - accuracy: 0.7425 - loss: 0.6459
Epoch 5/10
25/25 - 1s - 43ms/step - accuracy: 0.8200 - loss: 0.4975
Epoch 6/10
25/25 - 1s - 39ms/step - accuracy: 0.9038 - loss: 0.3049
Epoch 7/10
25/25 - 1s - 55ms/step - accuracy: 0.9475 - loss: 0.1867
Epoch 8/10
25/25 - 1s - 53ms/step - accuracy: 0.9613 - loss: 0.1358
Epoch 9/10
25/25 - 1s - 36ms/step - accuracy: 0.9812 - loss: 0.0822
Epoch 10/10
25/25 - 1s - 60ms/step - accuracy: 0.9887 - loss: 0.0576
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Malayalam - CNN:
Precision: 0.7476635514018691
Recall: 0.8
F1: 0.7729468599033816
Accuracy: 0.765
G-mean: 0.764198926981712
Confusion Matrix:
[[73 27]
 [20 80]]

Evaluating CNN model for Tamil:
Epoch 1/10
26/2

  _warn_prf(average, modifier, msg_start, len(result))


25/25 - 2s - 94ms/step - accuracy: 0.5150 - loss: 0.6936
Epoch 2/10
25/25 - 1s - 35ms/step - accuracy: 0.5713 - loss: 0.6900
Epoch 3/10
25/25 - 1s - 33ms/step - accuracy: 0.7563 - loss: 0.6559
Epoch 4/10
25/25 - 1s - 51ms/step - accuracy: 0.8075 - loss: 0.5239
Epoch 5/10
25/25 - 1s - 32ms/step - accuracy: 0.8950 - loss: 0.3503
Epoch 6/10
25/25 - 1s - 32ms/step - accuracy: 0.9212 - loss: 0.2288
Epoch 7/10
25/25 - 1s - 33ms/step - accuracy: 0.9600 - loss: 0.1335
Epoch 8/10
25/25 - 1s - 32ms/step - accuracy: 0.9787 - loss: 0.0784
Epoch 9/10
25/25 - 1s - 33ms/step - accuracy: 0.9925 - loss: 0.0491
Epoch 10/10
25/25 - 1s - 32ms/step - accuracy: 0.9912 - loss: 0.0362
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Malayalam - Keras + CNN:
Precision: 0.6984126984126984
Recall: 0.88
F1: 0.7787610619469026
Accuracy: 0.75
G-mean: 0.7386474125047755
Confusion Matrix:
[[62 38]
 [12 88]]

Evaluating Keras + CNN model for Tamil:
Epoch 1/10
26/26 - 2s - 88ms/step - accuracy: 0

  _warn_prf(average, modifier, msg_start, len(result))


26/26 - 3s - 105ms/step - accuracy: 0.4728 - loss: 0.6978
Epoch 2/10
26/26 - 1s - 33ms/step - accuracy: 0.4530 - loss: 0.6990
Epoch 3/10
26/26 - 1s - 23ms/step - accuracy: 0.5074 - loss: 0.6947
Epoch 4/10
26/26 - 1s - 23ms/step - accuracy: 0.4889 - loss: 0.6968
Epoch 5/10
26/26 - 1s - 22ms/step - accuracy: 0.4765 - loss: 0.6942
Epoch 6/10
26/26 - 1s - 22ms/step - accuracy: 0.5012 - loss: 0.6919
Epoch 7/10
26/26 - 1s - 24ms/step - accuracy: 0.5470 - loss: 0.6900
Epoch 8/10
26/26 - 1s - 22ms/step - accuracy: 0.5309 - loss: 0.6932
Epoch 9/10
26/26 - 1s - 23ms/step - accuracy: 0.5136 - loss: 0.6912
Epoch 10/10
26/26 - 1s - 23ms/step - accuracy: 0.4988 - loss: 0.6948
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Tamil - GloVe + CNN:
Precision: 0.0
Recall: 0.0
F1: 0.0
Accuracy: 0.52
G-mean: 0.0
Confusion Matrix:
[[52  0]
 [48  0]]

Evaluating GloVe + BiLSTM model for Malayalam:
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))


25/25 - 6s - 229ms/step - accuracy: 0.5587 - loss: 0.6855
Epoch 2/10
25/25 - 2s - 99ms/step - accuracy: 0.6162 - loss: 0.6518
Epoch 3/10
25/25 - 2s - 93ms/step - accuracy: 0.6662 - loss: 0.6096
Epoch 4/10
25/25 - 2s - 99ms/step - accuracy: 0.6737 - loss: 0.5768
Epoch 5/10
25/25 - 2s - 94ms/step - accuracy: 0.6825 - loss: 0.5656
Epoch 6/10
25/25 - 2s - 93ms/step - accuracy: 0.6963 - loss: 0.5511
Epoch 7/10
25/25 - 2s - 91ms/step - accuracy: 0.7088 - loss: 0.5339
Epoch 8/10
25/25 - 2s - 89ms/step - accuracy: 0.7300 - loss: 0.5116
Epoch 9/10
25/25 - 2s - 93ms/step - accuracy: 0.7387 - loss: 0.5040
Epoch 10/10
25/25 - 3s - 106ms/step - accuracy: 0.7525 - loss: 0.5069
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 82ms/step
Malayalam - GloVe + BiLSTM:
Precision: 0.7094017094017094
Recall: 0.83
F1: 0.7649769585253456
Accuracy: 0.745
G-mean: 0.7401351227985333
Confusion Matrix:
[[66 34]
 [17 83]]

Evaluating GloVe + BiLSTM model for Tamil:
Epoch 1/10
26/26 - 6s - 230ms/step - a

# DL with macro F1-score

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, LSTM, Bidirectional, Attention, Input
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, f1_score

# Parameters
vocab_size = 10000     # passed as num_words to the Tokenizer and as the Embedding input size
max_length = 200       # every token sequence is padded/truncated to this length
embedding_dim = 100    # Embedding dimension
oov_tok = "<OOV>"      # token substituted for out-of-vocabulary words

# Load datasets: Malayalam train/test, then Tamil train/test (read in this order)
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Normalize labels
label_mapping = {'human': 0, 'ai': 1}
def convert_labels(labels):
    """Map textual class labels to the integer ids in ``label_mapping``.

    Matching is case-insensitive and ignores surrounding whitespace, so
    'Human', 'AI ' and 'ai' are all accepted.

    Args:
        labels: Iterable of label values (e.g. a pandas Series of strings).

    Returns:
        np.ndarray of ints, one entry per input label.

    Raises:
        KeyError: If a label is not a key of ``label_mapping`` after
            normalisation; the message names the offending value and position.
    """
    encoded = []
    for position, label in enumerate(labels):
        key = str(label).strip().lower()
        if key not in label_mapping:
            raise KeyError(
                f"Unknown label {label!r} at position {position}; "
                f"expected one of {sorted(label_mapping)}"
            )
        encoded.append(label_mapping[key])
    return np.array(encoded, dtype=int)

# Integer labels for the four splits (train files use 'LABEL', test files 'Label')
mal_train_labels, mal_test_labels, tam_train_labels, tam_test_labels = (
    convert_labels(label_column)
    for label_column in (mal_train['LABEL'], mal_test['Label'], tam_train['LABEL'], tam_test['Label'])
)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
combined_train_text = pd.concat([mal_train['DATA'], tam_train['DATA']])  # Combine text for tokenizing
tokenizer.fit_on_texts(combined_train_text)


def _texts_to_padded(texts):
    """Turn raw texts into integer sequences padded/truncated to ``max_length``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)


mal_train_seq = _texts_to_padded(mal_train['DATA'])
mal_test_seq = _texts_to_padded(mal_test['DATA'])
tam_train_seq = _texts_to_padded(tam_train['DATA'])
tam_test_seq = _texts_to_padded(tam_test['Data'])  # the Tamil test file names its text column 'Data'

# Model definitions
def create_cnn_model():
    """Build the (uncompiled) CNN text classifier.

    Layers: trainable Embedding -> Conv1D(128 filters, kernel size 5, ReLU)
    -> global average pooling -> Dense(24, ReLU) -> Dense(1, sigmoid).

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.
    """
    return Sequential([
        # The input shape is declared with an explicit Input layer; the
        # Embedding `input_length` argument is deprecated in Keras 3.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_model():
    """Build the (uncompiled) bidirectional-LSTM text classifier.

    Layers: trainable Embedding -> Bidirectional(LSTM(64)) -> Dense(24, ReLU)
    -> Dense(1, sigmoid).

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.
    """
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_bilstm_attention_model():
    """Build the (uncompiled) BiLSTM + self-attention text classifier.

    Layers: Input -> trainable Embedding -> Bidirectional(LSTM(64)) returning
    the full sequence -> Attention with the BiLSTM output used as both query
    and value -> global average pooling -> Dense(24, ReLU) -> Dense(1, sigmoid).

    Returns:
        A functional ``tf.keras.Model`` mapping ``max_length`` token ids to one
        sigmoid probability.
    """
    token_ids = Input(shape=(max_length,))
    # The Input layer already fixes the sequence length, so the deprecated
    # Embedding `input_length` argument is not needed.
    embedded = Embedding(vocab_size, embedding_dim)(token_ids)
    sequence_states = Bidirectional(LSTM(64, return_sequences=True))(embedded)
    attended = Attention()([sequence_states, sequence_states])
    pooled = GlobalAveragePooling1D()(attended)
    hidden = Dense(24, activation='relu')(pooled)
    probability = Dense(1, activation='sigmoid')(hidden)
    return tf.keras.Model(inputs=token_ids, outputs=probability)

def create_keras_cnn_model():
    """Build the "Keras + CNN" classifier.

    This architecture is identical to the one built by ``create_cnn_model``
    (trainable Keras Embedding followed by the same Conv1D/pooling/Dense
    stack), so it delegates instead of duplicating the layer list. Every call
    returns a new, independently initialised model.

    Returns:
        A Keras ``Sequential`` model, uncompiled.
    """
    return create_cnn_model()

def create_glove_cnn_model(embedding_matrix=None):
    """Build the (uncompiled) CNN classifier with a frozen embedding layer.

    Args:
        embedding_matrix: Optional array of shape (vocab_size, embedding_dim)
            with pre-trained word vectors (e.g. GloVe) used to initialise the
            frozen Embedding layer.
            NOTE(review): when omitted (the original behaviour) the embedding
            is frozen at its random initial values — no GloVe vectors are
            loaded anywhere in this cell — so the layer carries no learned
            information.

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            (vocab_size, embedding_dim).
    """
    embedding_options = {'trainable': False}
    if embedding_matrix is not None:
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix must have shape {(vocab_size, embedding_dim)}, "
                f"got {embedding_matrix.shape}"
            )
        embedding_options['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_matrix)
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim, **embedding_options),
        Conv1D(128, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

def create_glove_bilstm_model(embedding_matrix=None):
    """Build the (uncompiled) BiLSTM classifier with a frozen embedding layer.

    Args:
        embedding_matrix: Optional array of shape (vocab_size, embedding_dim)
            with pre-trained word vectors (e.g. GloVe) used to initialise the
            frozen Embedding layer.
            NOTE(review): when omitted (the original behaviour) the embedding
            is frozen at its random initial values — no GloVe vectors are
            loaded anywhere in this cell.

    Returns:
        A Keras ``Sequential`` model that takes ``max_length`` token ids per
        sample and outputs one sigmoid probability.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            (vocab_size, embedding_dim).
    """
    embedding_options = {'trainable': False}
    if embedding_matrix is not None:
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.shape != (vocab_size, embedding_dim):
            raise ValueError(
                f"embedding_matrix must have shape {(vocab_size, embedding_dim)}, "
                f"got {embedding_matrix.shape}"
            )
        embedding_options['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_matrix)
    return Sequential([
        # Explicit Input layer replaces the deprecated Embedding `input_length`.
        Input(shape=(max_length,)),
        Embedding(vocab_size, embedding_dim, **embedding_options),
        Bidirectional(LSTM(64)),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

# Function to calculate G-mean
def calculate_gmean(conf_matrix):
    """Geometric mean of sensitivity and specificity for a binary classifier.

    Args:
        conf_matrix: 2x2 confusion matrix laid out as [[TN, FP], [FN, TP]]
            (array or nested list).

    Returns:
        float: sqrt(sensitivity * specificity). A rate whose denominator is
        zero (no samples of that class) is taken as 0.0 rather than NaN.

    Raises:
        ValueError: If ``conf_matrix`` is not 2x2.
    """
    matrix = np.asarray(conf_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"calculate_gmean expects a 2x2 confusion matrix, got shape {matrix.shape}"
        )
    TN, FP, FN, TP = matrix.ravel()
    actual_positives = TP + FN
    actual_negatives = TN + FP
    # Guard the divisions: 0/0 would otherwise give NaN plus a RuntimeWarning.
    sensitivity = TP / actual_positives if actual_positives else 0.0  # Recall
    specificity = TN / actual_negatives if actual_negatives else 0.0
    return float(np.sqrt(sensitivity * specificity))

# Train and evaluate function
def train_and_evaluate(model, train_data, train_labels, test_data, test_labels, language, model_name):
    """Compile and train a binary classifier, then print its test-set metrics.

    Reports binary precision/recall/F1 (positive class = 1), macro-averaged
    F1, accuracy, G-mean and the confusion matrix.

    Args:
        model: Keras model with a single sigmoid output; it is (re)compiled
            here. compile() does not re-initialise weights, so pass a freshly
            built model when runs must be independent.
        train_data: Training inputs passed to ``model.fit``.
        train_labels: 0/1 training labels.
        test_data: Test inputs passed to ``model.predict``.
        test_labels: 0/1 ground-truth test labels.
        language: Language name used in the printed report.
        model_name: Model name used in the printed report.

    Returns:
        dict with keys 'precision', 'recall', 'f1', 'macro_f1', 'accuracy',
        'gmean' and 'confusion_matrix' (previously nothing was returned).
    """
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(train_data, train_labels, epochs=10, verbose=2)
    # Threshold the probabilities at 0.5 and flatten to a 1-D label vector,
    # which is the shape the sklearn metric functions expect.
    predictions = (model.predict(test_data) > 0.5).astype(int).ravel()

    # Compute metrics
    # zero_division=0 keeps the 0.0 result but silences the undefined-metric
    # warning raised when the model never predicts one of the classes.
    precision, recall, f1, _ = precision_recall_fscore_support(
        test_labels, predictions, average='binary', zero_division=0
    )
    macro_f1 = f1_score(test_labels, predictions, average='macro', zero_division=0)
    accuracy = accuracy_score(test_labels, predictions)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
    conf_matrix = confusion_matrix(test_labels, predictions, labels=[0, 1])
    gmean = calculate_gmean(conf_matrix)

    # Print the results
    print(f"{language} - {model_name}:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1: {f1}")
    print(f"Macro F1: {macro_f1}")
    print(f"Accuracy: {accuracy}")
    print(f"G-mean: {gmean}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("="*50)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'macro_f1': macro_f1,
        'accuracy': accuracy,
        'gmean': gmean,
        'confusion_matrix': conf_matrix,
    }

# Instantiate and evaluate models
# Map each display name to the function that builds the model. A new model is
# built for every (model, language) pair: previously one instance was trained
# on Malayalam and then trained further on Tamil, so the Tamil scores came
# from weights already fitted to Malayalam.
model_builders = {
    'CNN': create_cnn_model,
    'BiLSTM': create_bilstm_model,
    'BiLSTM + Attention': create_bilstm_attention_model,
    'Keras + CNN': create_keras_cnn_model,
    'GloVe + CNN': create_glove_cnn_model,
    'GloVe + BiLSTM': create_glove_bilstm_model
}

# (train inputs, train labels, test inputs, test labels) per language
language_splits = {
    "Malayalam": (mal_train_seq, mal_train_labels, mal_test_seq, mal_test_labels),
    "Tamil": (tam_train_seq, tam_train_labels, tam_test_seq, tam_test_labels),
}

# Evaluate for Malayalam and Tamil
for model_name, build_model in model_builders.items():
    for language, (lang_train_seq, lang_train_labels, lang_test_seq, lang_test_labels) in language_splits.items():
        print(f"\nEvaluating {model_name} model for {language}:")
        train_and_evaluate(build_model(), lang_train_seq, lang_train_labels,
                           lang_test_seq, lang_test_labels, language, model_name)



Evaluating CNN model for Malayalam:
Epoch 1/10




25/25 - 2s - 96ms/step - accuracy: 0.4850 - loss: 0.6947
Epoch 2/10
25/25 - 1s - 50ms/step - accuracy: 0.4850 - loss: 0.6935
Epoch 3/10
25/25 - 1s - 33ms/step - accuracy: 0.5950 - loss: 0.6882
Epoch 4/10
25/25 - 1s - 33ms/step - accuracy: 0.6988 - loss: 0.6562
Epoch 5/10
25/25 - 1s - 34ms/step - accuracy: 0.8338 - loss: 0.5383
Epoch 6/10
25/25 - 1s - 35ms/step - accuracy: 0.8775 - loss: 0.3600
Epoch 7/10
25/25 - 1s - 33ms/step - accuracy: 0.9337 - loss: 0.2070
Epoch 8/10
25/25 - 1s - 52ms/step - accuracy: 0.9600 - loss: 0.1233
Epoch 9/10
25/25 - 1s - 33ms/step - accuracy: 0.9750 - loss: 0.0761
Epoch 10/10
25/25 - 1s - 40ms/step - accuracy: 0.9875 - loss: 0.0471
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Malayalam - CNN:
Precision: 0.6942148760330579
Recall: 0.84
F1: 0.7601809954751132
Macro F1: 0.7320458050001264
Accuracy: 0.735
G-mean: 0.7274613391789284
Confusion Matrix:
[[63 37]
 [16 84]]

Evaluating CNN model for Tamil:
Epoch 1/10
26/26 - 2s - 86ms/step

# Transformer models for Malayalam and Tamil

In [None]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Function to prepare datasets for training
def prepare_datasets(texts, labels):
    """Return ``texts`` as a plain Python list and ``labels`` as a NumPy array.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series of strings).
        labels: Sequence of labels accepted by ``np.array``.

    Returns:
        tuple: (list of texts, np.ndarray of labels).
    """
    text_list = texts.tolist()
    label_array = np.array(labels)
    return text_list, label_array

# Load datasets: Malayalam train/test, then Tamil train/test (read in this order)
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)

# Alternative data source, currently disabled (kept for reference):
#mal_train = pd.read_csv('/kaggle/input/final-dataset/mal_training_data_hum_ai.csv')
#mal_test = pd.read_excel('/kaggle/input/final-dataset/mal_test.xlsx')
#tam_train = pd.read_csv('/kaggle/input/final-dataset/tam_training_data_hum_ai.csv')
#tam_test = pd.read_excel('/kaggle/input/final-dataset/tamil-test.xlsx')

# Encode labels
def _normalize_label_column(column):
    """Lower-case and strip label strings so train and test spellings agree."""
    return column.astype(str).str.strip().str.lower()


# LabelEncoder.transform raises on any value it was not fitted on, so labels
# are normalised first (the other cells lower-case labels for the same reason).
# The encoder is fitted once on both training sets so Malayalam and Tamil share
# a single class -> id mapping instead of being refitted between languages.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([
    _normalize_label_column(mal_train['LABEL']),
    _normalize_label_column(tam_train['LABEL']),
]))
mal_train_labels = label_encoder.transform(_normalize_label_column(mal_train['LABEL']))
mal_test_labels = label_encoder.transform(_normalize_label_column(mal_test['Label']))
tam_train_labels = label_encoder.transform(_normalize_label_column(tam_train['LABEL']))
tam_test_labels = label_encoder.transform(_normalize_label_column(tam_test['Label']))

# Tokenization and model preparation
def train_and_evaluate_transformer(model, tokenizer, train_texts, train_labels, test_texts, test_labels, language, model_name,
                                   epochs=3, batch_size=16, max_length=128, learning_rate=2e-5):
    """Fine-tune a TF sequence-classification transformer and print test loss/accuracy.

    Args:
        model: TF Hugging Face sequence-classification model; compiled here.
        tokenizer: Matching tokenizer, called on the raw text lists.
        train_texts: List of training strings.
        train_labels: Integer class ids for ``train_texts``.
        test_texts: List of test strings.
        test_labels: Integer class ids for ``test_texts``.
        language: Language name used in the printed messages.
        model_name: Model name used in the printed messages.
        epochs: Number of fine-tuning epochs (default 3, as before).
        batch_size: Batch size for both datasets (default 16, as before).
        max_length: Token truncation length (default 128, as before).
        learning_rate: Adam learning rate. Defaults to 2e-5; the previous
            ``optimizer='adam'`` used Keras' default of 1e-3, which is far too
            large for fine-tuning pre-trained transformer weights.

    Returns:
        The list returned by ``model.evaluate`` ([loss, accuracy]).
    """
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="tf")
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length, return_tensors="tf")

    # Shuffle the training examples: Keras does not shuffle a tf.data.Dataset
    # itself, and the CSV rows may be grouped by label.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
        .shuffle(max(len(train_texts), 1))
        .batch(batch_size)
    )
    test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)

    # The model outputs raw logits, hence from_logits=True. An explicit Keras
    # loss replaces `model.compute_loss`, which is deprecated in transformers.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    print(f"Training {model_name} for {language}...")
    model.fit(train_dataset, epochs=epochs)
    print(f"Evaluating {model_name} for {language}...")
    eval_results = model.evaluate(test_dataset)
    print(f"{language} - {model_name} - Loss: {eval_results[0]}, Accuracy: {eval_results[1]}")
    return eval_results

# Initialize models and tokenizers
# display name -> (checkpoint id, tokenizer class, model class)
models_info = {
    'DistilBERT': ('distilbert-base-uncased', AutoTokenizer, TFAutoModelForSequenceClassification),
    'MuRIL': ('google/muril-base-cased', AutoTokenizer, TFAutoModelForSequenceClassification),
    'RoBERTa': ('roberta-base', AutoTokenizer, TFAutoModelForSequenceClassification)
}


def _text_column(frame):
    """Return the name of the text column of ``frame``.

    The CSV files do not agree on its spelling: the deep-learning cells read
    'DATA' from most files but 'Data' from the Tamil test file.
    """
    for candidate in ('DATA', 'Data'):
        if candidate in frame.columns:
            return candidate
    raise KeyError("Text column not found. Available columns: " + ", ".join(map(str, frame.columns)))


# Convert every split once, outside the model loop (the result does not depend
# on the model being processed).
mal_train_texts, mal_train_labels = prepare_datasets(mal_train[_text_column(mal_train)], mal_train_labels)
mal_test_texts, mal_test_labels = prepare_datasets(mal_test[_text_column(mal_test)], mal_test_labels)
tam_train_texts, tam_train_labels = prepare_datasets(tam_train[_text_column(tam_train)], tam_train_labels)
tam_test_texts, tam_test_labels = prepare_datasets(tam_test[_text_column(tam_test)], tam_test_labels)

language_splits = [
    ("Malayalam", mal_train_texts, mal_train_labels, mal_test_texts, mal_test_labels),
    ("Tamil", tam_train_texts, tam_train_labels, tam_test_texts, tam_test_labels),
]

for model_name, (model_path, tokenizer_class, model_class) in models_info.items():
    tokenizer = tokenizer_class.from_pretrained(model_path)
    print(f"\nProcessing {model_name}...")

    # Training and evaluating for Malayalam, then Tamil
    for language, lang_train_texts, lang_train_labels, lang_test_texts, lang_test_labels in language_splits:
        # Reload the pre-trained checkpoint for each language; otherwise the
        # Tamil run would continue from weights already fine-tuned on Malayalam.
        model = model_class.from_pretrained(model_path, num_labels=len(label_encoder.classes_))
        train_and_evaluate_transformer(model, tokenizer, lang_train_texts, lang_train_labels,
                                       lang_test_texts, lang_test_labels, language, model_name)


In [None]:
# Reload the four CSV splits: Malayalam train/test, then Tamil train/test
mal_train, mal_test, tam_train, tam_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ai-review-ml-dl-transformer/mal_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/mal_test.xlsx - Sheet1.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tam_training_data_hum_ai.csv',
        '/kaggle/input/ai-review-ml-dl-transformer/tamil-test.xlsx - Sheet1.csv',
    )
)
