In [None]:
import pandas as pd

# Load the raw fastText-formatted training file into a single-column DataFrame.
# The path is relative, matching the 'train.ft.txt' path the cleaning cell
# further down opens, so both cells resolve the same file from the working
# directory instead of one of them depending on an absolute directory.
# NOTE(review): the file is not real TSV; the tab separator is only there so
# each line lands in one column - confirm no review text contains a tab.
data = pd.read_csv('train.ft.txt', header=None, delimiter='\t')

# Preview the first 1000 rows (pandas abbreviates the printed frame).
print(data.head(1000))


                                                     0
0    __label__2 Stuning even for the non-gamer: Thi...
1    __label__2 The best soundtrack ever to anythin...
2    __label__2 Amazing!: This soundtrack is my fav...
3    __label__2 Excellent Soundtrack: I truly like ...
4    __label__2 Remember, Pull Your Jaw Off The Flo...
..                                                 ...
995  __label__1 Fuzzy around the edges: I have only...
996  __label__1 Brain Based Learning: The New Parad...
997  __label__2 Brain based Learning: This is a tex...
998  __label__1 Pop psychology at its worst: I find...
999  __label__1 "Science": "On average, we breathe ...

[1000 rows x 1 columns]


In [None]:
import re

def clean_data(data_point):
    """Normalize one raw review string into lowercase alphanumeric text.

    Steps, in order:
      1. Drop any fastText label prefix (``__label__<digits>`` followed by
         whitespace). Labels with more than one digit, e.g. ``__label__10``,
         are removed as well.
      2. Delete every character that is not an ASCII letter, a digit, or
         whitespace. Characters are deleted, not replaced by a space, so
         ``non-gamer`` becomes ``nongamer``.
      3. Lowercase the result.

    Args:
        data_point (str): Raw text, with or without the label prefix.

    Returns:
        str: The cleaned text. Existing whitespace is left untouched.
    """
    # ``\d+`` (rather than a single ``\d``) so multi-digit labels are stripped
    # instead of surviving as ``label10`` after the punctuation pass below.
    data_point = re.sub(r'__label__\d+\s+', '', data_point)

    # Keep only ASCII letters, digits and whitespace.
    data_point = re.sub(r'[^A-Za-z0-9\s]+', '', data_point)

    return data_point.lower()

# Read the raw fastText file; each line looks like "__label__<n> <review text>".
with open('train.ft.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Parallel lists: cleaned_data_points[i] is the cleaned text for labels[i].
cleaned_data_points = []
labels = []

for line in lines:
    # Split once and reuse the tokens for both the label and the text.
    tokens = line.split()
    if not tokens:
        # Blank line (e.g. a trailing empty line): there is no label to parse,
        # and indexing tokens[0] would raise IndexError.
        continue

    # "__label__2" -> 2
    label = int(tokens[0].split('__label__')[1])

    # Re-joining on single spaces collapses runs of whitespace in the review.
    data_point = ' '.join(tokens[1:])

    cleaned_data_point = clean_data(data_point)

    cleaned_data_points.append(cleaned_data_point)
    labels.append(label)

# Show a small sample only; printing every record floods the notebook output.
PREVIEW_ROWS = 5
print(f"Parsed {len(labels)} records; showing the first {min(PREVIEW_ROWS, len(labels))}:")
for cleaned_data_point, label in zip(cleaned_data_points[:PREVIEW_ROWS], labels[:PREVIEW_ROWS]):
    print(f"Label: {label}, Cleaned Data Point: {cleaned_data_point}")


In [None]:
# Persist the cleaned corpus, one "<label> <cleaned text>" record per line.
with open('cleaned_dataset.txt', 'w', encoding='utf-8') as output_file:
    output_file.writelines(
        f"{label} {cleaned_data_point}\n"
        for cleaned_data_point, label in zip(cleaned_data_points, labels)
    )

print("Cleaned data saved to 'cleaned_dataset.txt'")

Cleaned data saved to 'cleaned_dataset.txt'


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump

# Fit a logistic-regression model on the training split.
# X_train / X_test / y_train / y_test come from the train_test_split call in the
# TF-IDF cell further down in this file, which has to be executed first.
classifier = LogisticRegression().fit(X_train, y_train)

# Score the model on the held-out split.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Save the fitted model; the returned file list is displayed as the cell output.
dump(classifier, 'logistic_regression_model_3.joblib')

Accuracy: 0.8465163244698755


['logistic_regression_model_3.joblib']

In [2]:
import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Stop-word set and lemmatizer are created on the first call and then reused,
# instead of being rebuilt for every document.
_STOP_WORDS = None
_LEMMATIZER = None


def clean_and_tokenize(text):
    """Clean a document and return its lemmatized, stop-word-free tokens.

    Processing order:
      1. Delete every character that is not an ASCII letter, digit or
         whitespace, then lowercase the text.
      2. Tokenize with NLTK ``word_tokenize``.
      3. Drop tokens found in NLTK's English stop-word list.
      4. Lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text (str): Input document.

    Returns:
        str: The processed tokens joined by single spaces (an empty string
        when no token survives).
    """
    global _STOP_WORDS, _LEMMATIZER

    # Lazy initialisation: the NLTK resources are only touched when the
    # function is first called, i.e. after the downloads above have run.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Remove special characters, punctuation, and symbols; then lowercase.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text).lower()

    tokens = word_tokenize(text)

    # Filter stop words first, then lemmatize what is left.
    return ' '.join(
        _LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS
    )

# Load the "<label> <cleaned text>" records of the cleaned dataset.
# The path is relative, matching the 'cleaned_dataset.txt' path the file is
# written to earlier in this notebook.
with open('cleaned_dataset.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Extract data points and labels
data_points = []
labels = []
for line in lines:
    # partition() never raises, whereas unpacking split(' ', 1) fails with
    # ValueError on a line that contains no space (e.g. a blank line).
    label, _sep, text = line.partition(' ')
    if not label.strip():
        continue  # blank line: nothing to parse
    labels.append(int(label))
    data_points.append(text.strip())

# Clean and tokenize the data points
tokenized_data = [clean_and_tokenize(text) for text in data_points]

# Convert tokens into numerical vectors using TF-IDF.
# NOTE(review): the vectorizer is fitted on the whole corpus before the split
# below, so the test documents contribute to the vocabulary/IDF weights.
# Fitting on the training part only would avoid that, but the vectorizer that
# is saved to disk later would have to be changed together with it.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(tokenized_data).toarray()
y = np.array(labels)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Training set shape: (11880, 1000) (11880,)
Testing set shape: (2971, 1000) (2971,)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

# Persist the vectorizer that the feature-extraction cell fitted and used to
# build X / X_train. Saving that exact object - rather than constructing and
# fitting a second TfidfVectorizer with a duplicated max_features value - keeps
# the saved vocabulary in step with the features the classifier is trained on
# and avoids fitting the corpus a second time.
# Requires `tfidf_vectorizer` from the feature-extraction cell in the kernel.
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import load
import numpy as np

# Restore the fitted vectorizer that was saved with joblib.
tfidf_vectorizer = load('tfidf_vectorizer.joblib')

# `tokenized_data` and `labels` must already exist in the kernel.
# Rebuild the dense TF-IDF feature matrix and the label vector from them.
tfidf_matrix = tfidf_vectorizer.transform(tokenized_data)
X, y = tfidf_matrix.toarray(), np.array(labels)




In [8]:
from joblib import load
import numpy as np

# Load the trained classifier from the file
loaded_classifier = load('logistic_regression_model_3.joblib')

# X_test is the TF-IDF feature matrix returned by train_test_split; it has no
# label column. It is passed to the model unchanged: replacing its last column
# with y_pred would overwrite a real feature with label values and feed the
# model's own earlier predictions back in as input.
new_data_predictions = loaded_classifier.predict(X_test)

# Print the predictions
print("Predictions:", new_data_predictions)



Predictions: [1 2 1 ... 1 1 1]


In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Confusion matrix of the held-out labels (y_test) against the predictions (y_pred).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[30342,  5405],
       [ 4874, 31804]])

In [32]:
# Text report with per-class precision, recall, f1-score and support.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           1       0.86      0.85      0.86     27187
           2       0.86      0.87      0.86     28051

    accuracy                           0.86     55238
   macro avg       0.86      0.86      0.86     55238
weighted avg       0.86      0.86      0.86     55238



In [13]:
from joblib import load

# Load the trained classifier and the TF-IDF vectorizer saved earlier.
loaded_classifier = load('logistic_regression_model_3.joblib')
vectorizer = load('tfidf_vectorizer.joblib')

# Prompt the user to enter the text for prediction
user_input_text = input("Enter the text for prediction: ")

# Run the input through the same preprocessing as the training documents
# (clean_and_tokenize: punctuation removal, lowercasing, stop-word removal,
# lemmatization) before vectorizing. Vectorizing the raw text instead would
# look up un-lemmatized words the vocabulary was never built from.
# Requires clean_and_tokenize from the preprocessing cell to be defined.
user_input_vectorized = vectorizer.transform([clean_and_tokenize(user_input_text)])

# transform() on a one-item list already yields a single-row 2D matrix, so a
# dense conversion is all that is needed (no reshape).
user_input_vectorized = user_input_vectorized.toarray()

# An all-zero row means no word of the input is in the vocabulary (this
# includes empty input); the prediction then carries no information from it.
if not user_input_vectorized.any():
    print("Warning: no known words in the input; the result is not based on its content")

# Use the loaded classifier to make predictions on the user input
new_data_predictions = loaded_classifier.predict(user_input_vectorized)

# Label 1 is reported as negative, any other label as positive.
if new_data_predictions[0] == 1:
    print("Result: This review is negative")
else:
    print("Result: This review is positive")


Enter the text for prediction: this is bad
Result: This review is negative
