# Installing Required Libraries

In [2]:
!pip install nltk tensorflow scikit-learn pandas

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 

In [23]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical

# Fetch the NLTK resources used by the preprocessing cells.
# Recent NLTK releases (3.9.x is installed above) load the word-tokenizer
# models from 'punkt_tab'; 'punkt' is kept so older releases still work.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/jason/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jason/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jason/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Toy corpus as (sentence, sentiment label) pairs, one pair per sample.
samples = [
    ('I love programming in Python', 'positive'),
    ('Python is an amazing language', 'positive'),
    ('I enjoy machine learning', 'positive'),
    ('Deep learning is a subset of machine learning', 'positive'),
    ('I dislike bugs in code', 'negative'),
    ('Debugging can be frustrating', 'negative'),
    ('I love solving problems', 'positive'),
    ('I prefer Java over C++', 'neutral'),
    ('C++ is powerful but complex', 'neutral'),
    ('I enjoy reading about algorithms', 'positive'),
]

# Column-oriented view of the same samples, keyed by column name.
data = {
    'text': [sentence for sentence, sentiment in samples],
    'label': [sentiment for sentence, sentiment in samples],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Section A: Problem Description and Data Overview

## Problem Overview
This project addresses a **text classification** problem where the goal is to categorize textual data into predefined labels. Text classification is a common natural language processing (NLP) task used for applications like sentiment analysis, spam detection, and topic categorization.

## Data Description
The dataset consists of text data, where each piece of text corresponds to a label indicating its sentiment or category. The data includes 10 text samples and their respective labels:

- **Text**: Sentences expressing opinions or statements about programming and related topics.
- **Labels**: Categories include 'positive', 'negative', and 'neutral', representing the sentiment or stance of each sentence.

## Nature of the Problem
This is a **supervised learning problem** where the objective is to train a model to predict labels based on input text. The task is to develop a **text classification** model that can classify sentences into the correct categories.


In [25]:
# NLTK helpers used when normalizing text.
stop_words = set(stopwords.words('english'))  # English stop-word list as a set for fast membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # alternative to stemming (see clean_text)

# TF-IDF featurizer; max_features limits the vocabulary to at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

In [26]:
def clean_text(text):
    """Normalize a raw sentence into a space-separated string of stemmed tokens.

    The text is lowercased and split with ``nltk.word_tokenize``. A token is
    kept only when it is not in ``stop_words``, is not found in
    ``string.punctuation``, and is purely alphanumeric. Each kept token is
    then reduced with the module-level ``stemmer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    kept_stems = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip stop words.
        if token in stop_words:
            continue
        # Skip punctuation and anything containing special characters.
        if token in string.punctuation or not token.isalnum():
            continue
        # Stemming is applied here; lemmatizer.lemmatize(token) would be the
        # lemmatization alternative.
        kept_stems.append(stemmer.stem(token))

    return " ".join(kept_stems)

# Example usage: a noisy sentence with punctuation, contractions and a hashtag.
text = "I love programming in Python! It's amazing, isn't it? #PythonRocks"
cleaned_text = clean_text(text)
print(cleaned_text)

love program python amaz pythonrock


# Section B and C: Sentences to Words

## Sentence Segmentation
Sentence segmentation is the process of dividing a given text into individual sentences. This step helps in understanding the structure of the text and is essential for tasks like machine translation and text summarization. In this project, each sample is already a single sentence, so no separate segmentation step is applied; every sample is passed directly to word tokenization.

## Converting to Lowercase
To ensure uniformity and reduce the complexity of the data, all text is converted to lowercase. This prevents the model from treating the same word with different cases (e.g., "Python" vs. "python") as separate entities.

## Tokenization of Words
Tokenization is the process of breaking a sentence into individual words or tokens. By splitting the text into smaller, meaningful units, tokenization allows for better analysis and manipulation of the data. In this project, we use word tokenization to extract the essential components (words) from each sentence for further processing.


In [9]:
# Replace every raw sentence with its cleaned, stemmed form.
cleaned_sentences = df['text'].apply(clean_text)
df['text'] = cleaned_sentences

# Convert labels to numerical format: each distinct label becomes an integer
# category code.
label_categories = df['label'].astype('category')
df['label'] = label_categories.cat.codes

In [11]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Fit the vectorizer on the training sentences only, and vectorize them in the
# same call.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

# Section D: Converting Text Data to Machine Learnable Form

After preparing the text data, we need to convert it into a machine-learnable format that can be used by machine learning models, especially for classification tasks. This is typically done by transforming the text into numerical vectors using techniques like **TF-IDF** and converting the labels into categorical format.

## TF-IDF Transformation
The **TF-IDF (Term Frequency-Inverse Document Frequency)** model is used to convert the text data into a numerical form that captures the importance of each word in relation to the entire corpus. In the following code, the `tfidf_vectorizer` (already fitted on the training set) is used to transform the validation set (`X_val`) into TF-IDF feature vectors, and the integer labels of both splits are one-hot encoded with `to_categorical`:



In [15]:
# Vectorize the validation sentences with the already-fitted vectorizer.
X_val_tfidf = tfidf_vectorizer.transform(raw_documents=X_val)

# One-hot encode the integer labels of both splits (3 classes).
# NOTE: y_train / y_val are rebound here, so re-running this cell without
# re-running the split would encode labels that are already encoded.
y_train, y_val = [
    to_categorical(labels, num_classes=3) for labels in (y_train, y_val)
]

# Section E: Machine Learning Models

## Selecting the Appropriate Model
For this text classification problem, we have selected a **Neural Network** as the model of choice due to its ability to capture complex relationships in text data. Specifically, a **Sequential Neural Network** architecture is used, which is suitable for this multi-class classification problem with three classes: positive, negative, and neutral.

### Model Architecture
The model is a fully connected feedforward neural network built using Keras' **Sequential API**. Below is the architecture of the model:

In [16]:
# Seed Python, NumPy and the Keras backend before any weights are created so
# that weight initialization (and the shuffling done later by fit) is
# repeatable across runs.
set_random_seed(42)

# Fully connected feed-forward classifier over the TF-IDF features.
model = Sequential([
    # layers.Input(shape=...) replaces layers.InputLayer(input_shape=...),
    # whose `input_shape` argument is deprecated in Keras 3.
    layers.Input(shape=(X_train_tfidf.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    # One softmax output per class; output indices follow the integer label
    # codes (negative, neutral, positive).
    layers.Dense(3, activation='softmax'),
])



In [17]:
# Adam optimizer with categorical cross-entropy (the labels are one-hot
# encoded); accuracy is tracked as the reporting metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [18]:
# Train for 10 epochs in mini-batches of 4, scoring the held-out split after
# every epoch.
model.fit(
    x=X_train_tfidf,
    y=y_train,
    batch_size=4,
    epochs=10,
    validation_data=(X_val_tfidf, y_val),
)

Epoch 1/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 1s 174ms/step - accuracy: 0.4167 - loss: 1.0454 - val_accuracy: 0.0000e+00 - val_loss: 1.0681
Epoch 2/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 75ms/step - accuracy: 0.6667 - loss: 1.0204 - val_accuracy: 0.5000 - val_loss: 1.0563
Epoch 3/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.6667 - loss: 0.9773 - val_accuracy: 0.5000 - val_loss: 1.0462
Epoch 4/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.7500 - loss: 0.9347 - val_accuracy: 0.5000 - val_loss: 1.0380
Epoch 5/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 68ms/step - accuracy: 0.7500 - loss: 0.8998 - val_accuracy: 0.5000 - val_loss: 1.0325
Epoch 6/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 69ms/step - accuracy: 0.8333 - loss: 0.8824 - val_accuracy: 0.5000 - val_loss: 1.0264
Epoch 7/10
2/2 ━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x17667e190>

# Section F: Quality of Results Communication and Discussions

## Model Evaluation
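After training the model, it's essential to evaluate its performance on the validation set to determine how well it generalizes to unseen data. This is done by computing metrics such as **loss** and **accuracy**, which provide insights into the model's effectiveness. Because the dataset has only 10 samples, the 20% validation split contains just 2 sentences, so the reported accuracy can only be 0, 0.5, or 1 and should be read as illustrative rather than conclusive.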

In [19]:
# Score the trained model on the held-out split; evaluate returns the loss
# followed by the metric configured at compile time.
loss, accuracy = model.evaluate(x=X_val_tfidf, y=y_val)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5000 - loss: 1.0080
Validation Loss: 1.008040428161621, Validation Accuracy: 0.5
