In [None]:
# Importing the libraries

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from mlxtend.plotting import plot_confusion_matrix
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Preprocessing hyperparameters shared by the tokenizer, padding and embedding cells
vocabSize, maxLen = 10_000, 200   # vocabulary cap, tokens kept per review

In [None]:
# Checking the encoding type of the data, then loading it
import chardet

# Single definition of the CSV location so detection and loading cannot drift apart
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/LP 5/imdb.csv'

# chardet only inspects the first 10,000 bytes, so its answer is a hint, not a guarantee
with open(DATA_PATH, 'rb') as f:
    result = chardet.detect(f.read(10000))
print(result)

# A pure-ASCII sample is reported as 'ascii', which would make read_csv fail on the first
# non-ASCII byte later in the file. UTF-8 is a strict superset of ASCII, so widen to it;
# also fall back to UTF-8 when chardet returns no encoding at all.
encoding = result['encoding'] or 'utf-8'
if encoding.lower() == 'ascii':
    encoding = 'utf-8'

# Reading the dataset & dropping irrelevant features
df = pd.read_csv(DATA_PATH, encoding=encoding)
df = df.drop(columns=['Unnamed: 0', 'type', 'file'])  # removing unnecessary columns

# print(df[df['label'] == 'pos']) # to check only rows with positive reviews

In [None]:
# Cleans the labels to only have POSITIVE & NEGATIVE labels, maps them to 1 and 0 resp.
# NOTE: run once per load. After the mapping the labels are 0/1, so re-running this cell
# would filter out every row; re-run the loading cell first if you need to repeat it.

print(df['label'].unique())     # checks all unique label values

# .copy() makes the filtered frame independent, so the column assignment below writes to
# `df` itself instead of to a slice of the original frame (pandas' SettingWithCopy hazard).
df = df[df['label'].isin(['pos', 'neg'])].copy()   # removes unsup
df['label'] = df['label'].map({'pos': 1, 'neg': 0}).astype(int)
df.head()

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['review'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a tokenizer on the training reviews, then turn each review (text) into a sequence of word indices
# Example: "This movie was great" → [23, 55, 12, 875]

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocabSize,
    oov_token="<OOV>",   # it is OOV, not zero-zero-V
)

# The vocabulary is learned from the training split only; the test split is just converted
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [None]:
# Bring every sequence to exactly maxLen tokens.
# Short reviews get zeros appended (padding='post'); long reviews lose their earliest tokens,
# because truncating='pre' is the pad_sequences default (spelled out here for visibility).

x_train_pad, x_test_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=maxLen,
        padding='post',
        truncating='pre',
    )
    for sequences in (x_train_seq, x_test_seq)
)

In [None]:
# Building the Neural Network Model

# Seed the Python, NumPy and TensorFlow RNGs in one call so the randomly initialised weights
# are repeatable across runs, in line with the fixed random_state used for the data split.
# (Some GPU ops may still be non-deterministic, so expect close rather than bit-identical runs.)
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    # Looks up a trainable 128-dimensional vector for each of the maxLen word indices
    tf.keras.layers.Embedding(input_dim=vocabSize, output_dim=128, input_length=maxLen),
    # Collapses the per-token vectors into one flat feature vector per review
    tf.keras.layers.Flatten(),
    # Hidden fully connected layer
    tf.keras.layers.Dense(128, activation='relu'),
    # Single sigmoid unit: one value in (0, 1) per review
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Compile the model, then train it
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The test split is passed as validation data, so the per-epoch val_* values are test-set metrics
history = model.fit(
    x_train_pad,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(x_test_pad, y_test),
)

In [None]:
# Training curves: one line per entry recorded in history.history

ax = pd.DataFrame(history.history).plot(figsize=(8, 6))
ax.set_title('Training and Validation Metrics')
ax.set_xlabel('Epochs')
ax.set_ylabel('Value')
ax.grid(True)  # optional
plt.show()

In [None]:
# Score the trained model on the test split and report accuracy as a percentage

loss, accuracy = model.evaluate(x=x_test_pad, y=y_test)
print(f"Test Accuracy: {100 * accuracy:.2f}%")

In [None]:
# Predict one probability per test review, then threshold it into a class label

y_prob = model.predict(x_test_pad).ravel()   # flattened sigmoid outputs
y_pred = (y_prob > 0.5).astype(int)          # prob > 0.5 → class 1 (pos), otherwise class 0 (negative)

In [None]:
# Classification report comparing the true test labels with the thresholded predictions
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Confusion Matrix, drawn with the class names Negative / Positive
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm, class_names=['Negative', 'Positive'])
ax.set_title('Confusion Matrix')
plt.show()

### 🔹 **1. Imports & Setup**

```python
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from mlxtend.plotting import plot_confusion_matrix
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
```

- **Importing libraries**:
  - `tensorflow`: For building and training our deep learning model.
  - `pandas`: For handling the dataset (reading CSV and working with columns).
  - `matplotlib`: To make plots (like accuracy/loss curves).
  - `sklearn`: For splitting the data (`train_test_split`) and for evaluation metrics such as the classification report and confusion matrix.
  - `mlxtend`: To help visualize the confusion matrix clearly.
  - `warnings`: To hide any unnecessary warnings in output.
- `%matplotlib inline`: Only used in Jupyter Notebooks to show plots directly below code cells.

---

### 🔹 **2. Set Parameters**

```python
vocab_size = 10000
max_len = 200
```

- `vocab_size`: How many **unique words** we want the tokenizer to keep (top 10,000 most common words).
- `max_len`: The **maximum number of words per review**. If a review is shorter, we pad it; if it's longer, we cut it.

---

### 🔹 **3. Load and Prepare Data**

```python
df = pd.read_csv('imdb_reviews.csv')  # Make sure this file has 'review' and 'sentiment' columns

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
```

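- We load the dataset from a CSV file using pandas.
- The dataset should have two columns: `"review"` (text) and `"sentiment"` (positive/negative).
  - In the notebook cells above, the file is `imdb.csv` and the label column is named `label`, with values `pos` / `neg`. Rows with any other label (e.g. `unsup`) are dropped before mapping, and the unused columns `Unnamed: 0`, `type` and `file` are removed.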
- We convert:
  - `"positive"` → `1`
  - `"negative"` → `0`  
  This is necessary because our model needs numbers to work, not words.

---

### 🔹 **4. Split the Dataset**

```python
x_train, x_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)
```

- We split our data:
  - `80%` for training the model (`x_train`, `y_train`)
  - `20%` for testing/evaluating it (`x_test`, `y_test`)
- `random_state=42` ensures we get the same split every time we run the code.

---

### 🔹 **5. Tokenization (Converting Words to Numbers)**

```python
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)
```

- We create a **tokenizer**, which converts text into sequences of numbers.
- It:
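  - Keeps only the most frequent words when converting text to numbers: only word indices below `vocab_size` are used, i.e. at most `vocab_size - 1` distinct tokens including `<OOV>`.
  - Uses `<OOV>` for every other word — both rarer words and words it never saw while fitting.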
- `fit_on_texts`: It learns which words appear and assigns numbers to them.

---

### 🔹 **6. Convert Reviews to Number Sequences**

```python
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
```

- This converts each review into a list of numbers.
  - Example: `"This movie was great"` → `[23, 55, 12, 875]`

---

### 🔹 **7. Padding the Sequences**

```python
x_train_pad = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen=max_len, padding='post')
x_test_pad = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen=max_len, padding='post')
```

- Ensures every review is the **same length** (`max_len = 200`).
- If a review is too short → adds zeros at the end.
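- If too long → cuts extra words from the **beginning**, because `pad_sequences` uses `truncating='pre'` by default (pass `truncating='post'` to cut from the end instead).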

---

### 🔹 **8. Build the Neural Network**

```python
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
```

This is our **text classification model**:
1. **Embedding Layer**:
   - Turns word numbers into word vectors.
   - Learns word meanings as it trains.
2. **Flatten Layer**:
   - Converts 2D embeddings into 1D so the next layer can process it.
3. **Dense Layer (Hidden Layer)**:
   - A fully connected layer with 128 neurons.
   - Uses ReLU activation to introduce non-linearity.
4. **Dense Layer (Output Layer)**:
   - Only 1 neuron (because it's binary classification).
   - Uses sigmoid to output a probability between 0 and 1.

---

### 🔹 **9. Compile the Model**

```python
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
```

- **Optimizer**: `'adam'` adjusts weights to reduce error.
- **Loss**: `'binary_crossentropy'` is good for binary classification.
- **Metric**: We measure performance using `'accuracy'`.

---

### 🔹 **10. Train the Model**

```python
history = model.fit(x_train_pad, y_train, batch_size=128, epochs=5, validation_data=(x_test_pad, y_test))
```

- Trains the model for 5 rounds (epochs) using the training data.
- It also checks performance on test data after each epoch (`validation_data`).
- Stores training info in `history`.

---

### 🔹 **11. Plot Training History**

```python
pd.DataFrame(history.history).plot(figsize=(10,7))
plt.title("Training and Validation Metrics")
plt.xlabel("Epochs")
plt.ylabel("Value")
plt.grid(True)
plt.show()
```

- Plots how accuracy and loss change over the 5 epochs for both training and validation sets.

---

### 🔹 **12. Evaluate on Test Data**

```python
loss, accuracy = model.evaluate(x_test_pad, y_test)
print("Test Accuracy:", accuracy)
```

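- Checks how well the model performs on the held-out **test data**, which was never used to update the weights. Because the same test set was passed as `validation_data` during training, the result should match the validation metrics of the final epoch.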

---

### 🔹 **13. Make Predictions**

```python
y_pred = model.predict(x_test_pad).flatten()
y_pred = (y_pred > 0.5).astype(int)
```

- Predicts **probabilities** for each test review.
- Then converts them to labels:
  - If probability > 0.5 → class 1 (positive)
  - Else → class 0 (negative)

---

### 🔹 **14. Show Classification Report**

```python
print(metrics.classification_report(y_test, y_pred))
```
- Shows precision, recall, and F1-score for both classes (positive and negative).

---

### 🔹 **15. Confusion Matrix**

```python
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, class_names=['Negative', 'Positive'])
plt.title("Confusion Matrix")
plt.show()
```

- Shows how many reviews were **correctly** or **incorrectly** classified.
- Helps spot if the model is better at one class than the other.

---