In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D, Input
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("data.csv")
df = df[["target", "cleaned_text"]]
df["text"] = df["cleaned_text"]
df = df.drop(columns=["cleaned_text"], axis=1)
df.head()

Unnamed: 0,target,text
0,0,switchfoot httptwitpiccom2y1zl awww thats bumm...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many times ball managed save 50...
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving im mad cant see


In [3]:
df.shape

(1600000, 2)

In [4]:
df.columns

Index(['target', 'text'], dtype='object')

In [5]:
df["target"].unique()

array([0, 1])

In [6]:
df["target"].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [7]:
# Check for Nan check
print(f"Target contains NaN {df["target"].isnull().sum()}")
print(f"Text contains NaN {df["text"].isnull().sum()}")


Target contains NaN 0
Text contains NaN 322


In [8]:
# target counts for null values
df[df.isnull().any(axis =1)]["target"].value_counts()

target
0    179
1    143
Name: count, dtype: int64

In [9]:
df_cleaned = df.dropna()

In [10]:
df_cleaned.shape

(1599678, 2)

In [11]:
df_sample = df_cleaned.sample(frac=0.1, random_state=42)

In [12]:
X, y = df_sample["text"], df_sample["target"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
print(f"Train size : {len(X_train)} and test size {len(X_test)}")

Train size : 127974 and test size 31994


In [15]:
def create_fcnn(input_shape):
    model = Sequential()
    
    model.add(Input(shape=input_shape))
    
    model.add(Dense(
        128, 
        activation='relu',
        kernel_regularizer=l2(0.001)  # L2 Regularization
    ))
    model.add(Dropout(0.5))  # Dropout
    
    model.add(Dense(
        64, 
        activation='relu',
        kernel_regularizer=l2(0.001)  # L2 Regularization
    ))
    model.add(Dropout(0.5))  # Dropout
    
    model.add(Dense(1, activation='softmax'))
    
    return model


In [29]:
def evaluate_model(model, X_test_processed, y_test_categorical):
    y_pred = model.predict(X_test_processed)
    y_true = y_test_categorical
    
    acc = accuracy_score(y_true, y_pred)
    print(f"\nTest Accuracy: {acc:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive']))
    
    return acc

### Bag of words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vocab_size_bow = 10000
bow_vectorizer = CountVectorizer(max_features=vocab_size_bow)

In [17]:
X_train_bow = bow_vectorizer.fit_transform(X_train).toarray()

In [18]:
X_test_bow = bow_vectorizer.transform(X_test).toarray()

In [19]:
bow_model = create_fcnn(input_shape=(vocab_size_bow,))
bow_model.summary()

In [20]:
bow_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [21]:
history_bow = bow_model.fit(
    X_train_bow, y_train,
    epochs=7,
    batch_size=128,
    validation_data=(X_test_bow, y_test),
    verbose=1
)

Epoch 1/7


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.5035 - loss: 20.8581 - val_accuracy: 0.4962 - val_loss: 65.4746
Epoch 2/7
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5035 - loss: 150.8672 - val_accuracy: 0.4962 - val_loss: 253.6141
Epoch 3/7
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5035 - loss: 385.1405 - val_accuracy: 0.4962 - val_loss: 528.9077
Epoch 4/7
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5035 - loss: 696.2328 - val_accuracy: 0.4962 - val_loss: 873.5926
Epoch 5/7
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5035 - loss: 1070.9108 - val_accuracy: 0.4962 - val_loss: 1278.1554
Epoch 6/7
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5035 - loss: 1502.9104 - val_accuracy: 0.4962 - val_loss: 1738.0842
Epoch 

In [30]:
evaluate_model(bow_model, X_test_bow, y_test)

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 522us/step

Test Accuracy: 0.4962

Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00     16117
    Positive       0.50      1.00      0.66     15877

    accuracy                           0.50     31994
   macro avg       0.25      0.50      0.33     31994
weighted avg       0.25      0.50      0.33     31994



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


0.49624929674313933