In [1]:
import nltk
# Fetch the NLTK packages: stop-word list, perceptron POS tagger and Punkt tokenizer.
for resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)
# WordNet is fetched last as a bare expression so the cell still displays
# the return value of the final download call.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string


In [3]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# are reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the dataset folder the working directory; the CSV files are read later
# using bare file names relative to it.
%cd /content/drive/MyDrive/IMDBText



Mounted at /content/drive
/content/drive/MyDrive/IMDBText


In [4]:
import pandas as pd

# Read the train and test splits from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Preprocessing

In [5]:
# Function for text preprocessing

# Lazily-filled cache so the lemmatizer and the lookup sets are built once
# rather than once per review.
_PREPROCESS_RESOURCES = {}


def _preprocessing_resources():
    """Build the lemmatizer and lookup sets on first use and reuse them afterwards."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=frozenset(stopwords.words('english')),
            punctuation=frozenset(string.punctuation),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Tokenize, lemmatize and filter a single review.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces. Tokens keep their
        original casing; only the stop-word comparison is case-insensitive.
    """
    resources = _preprocessing_resources()
    lemmatizer = resources['lemmatizer']
    stop_words = resources['stop_words']
    punctuation = resources['punctuation']

    kept = []
    for token in word_tokenize(text):
        lemma = lemmatizer.lemmatize(token)
        if lemma.lower() in stop_words:
            continue
        # Drop tokens made up solely of punctuation characters. Testing
        # `lemma in string.punctuation` would be a substring check, which lets
        # multi-character tokens such as "..." or "``" slip through.
        if all(ch in punctuation for ch in lemma):
            continue
        kept.append(lemma)
    return ' '.join(kept)

In [6]:
# Run preprocess_text over the 'review' column of both splits,
# replacing the raw text with the cleaned version.
for split in (train_data, test_data):
    split['review'] = split['review'].apply(preprocess_text)


## TF-IDF

In [7]:
# Labels come straight from the 'sentiment' column of each split.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

# TF-IDF vectorizer with the vocabulary capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training reviews only, then reuse the fitted vectorizer
# to transform the test reviews.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['review'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['review'])

## Train and evaluate different machine learning models

### **Naive Bayes**

In [8]:
# Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
start_time = time.time()


In [9]:
# Make predictions on the test data
nb_pred = nb_classifier.predict(X_test_tfidf)
nb_training_time = time.time() - start_time


In [10]:
# Score the Naive Bayes predictions against the test labels, then print
# the accuracy and the recorded timing.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)
nb_report = [
    ("Naive Bayes Accuracy:", nb_accuracy),
    ("Naive Bayes Training Time:", nb_training_time, "seconds"),
]
for report_line in nb_report:
    print(*report_line)


Naive Bayes Accuracy: 0.8573
Naive Bayes Training Time: 0.023512601852416992 seconds


#### **Naive Bayes Accuracy: 0.8573**

### **Random Forest**

In [11]:
# Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
start_time = time.time()

In [12]:
# Make predictions on the test data
rf_pred = rf_classifier.predict(X_test_tfidf)
rf_training_time = time.time() - start_time


In [13]:
# Score the Random Forest predictions against the test labels, then print
# the accuracy and the recorded timing.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_pred)
rf_report = [
    ("Random Forest Accuracy:", rf_accuracy),
    ("Random Forest Training Time:", rf_training_time, "seconds"),
]
for report_line in rf_report:
    print(*report_line)


Random Forest Accuracy: 0.85015
Random Forest Training Time: 1.121053695678711 seconds


#### **Random Forest Accuracy: 0.85015**

### **SVM**

In [14]:
# Support vector classifier with a linear kernel.
svm_classifier = SVC(kernel='linear')
# Time only the fit() call on the TF-IDF training matrix.
start_time = time.time()
svm_classifier.fit(X=X_train_tfidf, y=y_train)
svm_training_time = time.time() - start_time


In [15]:
# Predict the test split with the fitted SVM and score it against the labels.
svm_pred = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)

# Report accuracy and the fit duration measured in the previous cell.
svm_report = [
    ("SVM Accuracy on Test Data:", svm_accuracy),
    ("SVM Training Time:", svm_training_time, "seconds"),
]
for report_line in svm_report:
    print(*report_line)

SVM Accuracy on Test Data: 0.8866
SVM Training Time: 625.0415663719177 seconds


#### **SVM Accuracy on Test Data: 0.8866**

### **GBM**

In [16]:
# Initialize GBM classifier
# random_state is pinned (same seed as the Random Forest cell) so repeated
# runs produce the same fitted model and accuracy.
gbm_classifier = GradientBoostingClassifier(random_state=42)
# Time only the fit() call.
start_time = time.time()
gbm_classifier.fit(X_train_tfidf, y_train)
gbm_training_time = time.time() - start_time


In [17]:
# Predict the test split with the fitted GBM and score it against the labels.
gbm_pred = gbm_classifier.predict(X_test_tfidf)
gbm_accuracy = accuracy_score(y_true=y_test, y_pred=gbm_pred)

# Report accuracy and the fit duration measured in the previous cell.
gbm_report = [
    ("GBM Accuracy on Test Data:", gbm_accuracy),
    ("GBM Training Time:", gbm_training_time, "seconds"),
]
for report_line in gbm_report:
    print(*report_line)

GBM Accuracy on Test Data: 0.81315
GBM Training Time: 144.13138127326965 seconds


#### **GBM Accuracy on Test Data: 0.81315**