In [None]:
# Installing kaggle package to work with Kaggle datasets
!pip install kaggle

In [None]:
# Creating a directory to store the kaggle.json API key file
# (-p: no error if ~/.kaggle already exists, so the cell can be re-run)
!mkdir -p ~/.kaggle
# Copying the kaggle.json file to the .kaggle folder for authentication.
# kaggle.json must already be present in the current working directory;
# it holds an API secret and should not be committed or shared with the notebook.
!cp kaggle.json ~/.kaggle/
# Setting file permissions to 600 (read/write for the owner only) so the key is not readable by other users
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Downloading the sentiment140 dataset (kazanova/sentiment140) from Kaggle.
# Requires the kaggle.json credentials to be installed under ~/.kaggle first.
# The extraction cell below expects the archive at /content/sentiment140.zip.
!kaggle datasets download -d kazanova/sentiment140

In [None]:
# Extracting the downloaded dataset from the zip file
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'
# The archive handle is deliberately NOT named `zip`: a cell-level `as zip`
# rebinds the global name and shadows the built-in zip() for every later cell.
with ZipFile(dataset, 'r') as archive:
  # No target path is given, so the files land in the current working directory
  archive.extractall()
  print('The dataset is extracted')

In [None]:
# Importing necessary libraries for data handling and processing
import numpy as np
import pandas as pd
import re
import nltk


In [None]:
# Importing NLP-specific libraries for text preprocessing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Importing libraries for feature extraction, model training, and performance evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Downloading the NLTK stopwords corpus for preprocessing (removing common words like "the", "is", etc.)
# This must run before any call to stopwords.words(...) in the cells below.
nltk.download('stopwords')


In [None]:
# Displaying the list of English stopwords (inspection only; the result is not stored)
print(stopwords.words('english'))

In [None]:
# Loading the dataset into a pandas dataframe.
# NOTE: no `names`/`header` argument is given here, so pandas uses the first line
# of the file as column labels. The file is re-read with explicit column names
# a few cells below, which replaces this frame.
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')


In [None]:
# Checking the number of rows and columns in the dataset
# (.shape is a (rows, columns) tuple; as the last expression it becomes the cell output)
twitter_data.shape


In [None]:
# Displaying the first few rows of the dataset to understand its structure
# (at this point the column labels are taken from the first line of the file)
twitter_data.head()

In [None]:
# Re-loading the CSV with explicit, more intuitive column names.
# This re-reads the file and replaces the earlier frame (it does not rename columns in place);
# because `names` is supplied, the first line of the file is now read as a data row.
column_names = ['target', 'date', 'id', 'flag', 'user', 'text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names=column_names, encoding='ISO-8859-1')


In [None]:
# Checking the shape of the dataset after re-loading it with explicit column names
twitter_data.shape

In [None]:
# Displaying the first few rows after re-loading with the new column names
twitter_data.head()

In [None]:
# Checking for any missing values in the dataset
# (isnull() flags missing cells; sum() then counts them per column)
twitter_data.isnull().sum()

In [None]:
# Checking the distribution of the target column (0 = negative, 4 = positive sentiment)
# value_counts() reports how many rows carry each distinct label.
twitter_data['target'].value_counts()


In [None]:
# Mapping the positive label 4 to 1 so the target is binary (0 = negative, 1 = positive).
# The nested dict restricts the replacement to the 'target' column only.
twitter_data = twitter_data.replace({'target': {4: 1}})

In [None]:
# Verifying the updated target column distribution (0 = negative, 1 = positive)
# After the relabelling above, no rows with label 4 should remain.
twitter_data['target'].value_counts()

In [None]:
# Initializing the PorterStemmer for stemming (reducing words to their root form)
# A single shared instance is created here; stemming() below refers to it by this global name.
port_stem = PorterStemmer()


In [None]:
# Defining a function to clean and stem the text data.
# The stopword set and the regex are built once, when this cell runs, instead of
# on every call: the original re-read stopwords.words('english') and scanned it
# as a list once per word, which dominated the runtime on the full dataset.
# Requires the NLTK stopwords corpus to be downloaded before this cell runs.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Return `content` lowercased, stripped of non-letters and stopwords, and stemmed.

    Args:
        content: Raw text (a string).

    Returns:
        A single string containing the stems of the remaining words,
        separated by single spaces (empty string if nothing remains).
    """
    # Replace every non-alphabetic character with a space, lowercase, and split on whitespace
    words = _NON_LETTERS.sub(' ', content).lower().split()
    # Drop English stopwords, then stem what is left with the shared PorterStemmer
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    # Join the stems back into one space-separated string
    return ' '.join(stems)


In [None]:
# Applying the stemming function to the 'text' column and creating a new column for processed content
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)
# Note: this step processes every row and can take a long time (around half an hour was observed);
# a long-running cell here does not mean something is wrong with the code.

In [None]:
# Displaying the first few rows, now including the new 'stemmed_content' column
twitter_data.head()

In [None]:
# Displaying the processed 'stemmed_content' column (plain-text print of the Series)
print(twitter_data['stemmed_content'])

In [None]:
# Displaying the 'target' column (sentiment labels; 0 = negative, 1 = positive after relabelling)
print(twitter_data['target'])

In [None]:
# Pulling the model inputs out of the dataframe:
#   X - the cleaned/stemmed tweet text, Y - the sentiment label
X, Y = twitter_data['stemmed_content'].values, twitter_data['target'].values

In [None]:
# Displaying the feature data (X): the stemmed tweet texts, not yet vectorized
print(X)


In [None]:
# Displaying the label data (Y): the sentiment labels aligned with X
print(Y)


In [None]:
# Holding out 20% of the rows for testing and keeping 80% for training.
# stratify=Y keeps the class proportions the same in both parts;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [None]:
# Printing the shapes of the full feature array, the training part and the testing part, in that order
print(*(part.shape for part in (X, X_train, X_test)))


In [None]:
# Displaying the training texts (X_train still holds stemmed strings here; TF-IDF is applied further below)
print(X_train)


In [None]:
# Displaying the testing texts (X_test still holds stemmed strings here; TF-IDF is applied further below)
print(X_test)

In [None]:
# Initializing the TfidfVectorizer to convert text data into numerical format (TF-IDF vectors)
# No arguments are passed, so the library defaults are used.
vectorizer = TfidfVectorizer()

In [None]:
# Fitting the vectorizer on the training data and transforming it into TF-IDF features.
# NOTE: X_train is rebound from raw text to the transformed matrix, so this cell is not
# safely re-runnable on its own; re-run the train/test split cell first to restore the text.
X_train = vectorizer.fit_transform(X_train)

In [None]:
# Transforming the test data into TF-IDF features using the same vectorizer.
# transform() (not fit_transform()) is used so that nothing is fitted on the test set.
# NOTE: X_test is rebound to the transformed matrix; re-run the split cell before re-running this one.
X_test = vectorizer.transform(X_test)

In [None]:
# Displaying the transformed training data (X_train is now the TF-IDF matrix, printed in sparse format)
print(X_train)

In [None]:
# Displaying the transformed test data (X_test is now the TF-IDF matrix, printed in sparse format)
print(X_test)