# Importing libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split  # Added import
from sklearn.svm import SVC  # Added import

# Load the data from the CSV file
data = pd.read_csv('twitter_training.csv')

# Dataset preprocessing

In [2]:
# Define a function to clean and preprocess the text
def clean_text(text):
    if isinstance(text, str):
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
        cleaned_text = ' '.join(filtered_words)
        cleaned_text = cleaned_text.lower()
        cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
        if cleaned_text.strip():
            return cleaned_text
    return None

# Apply the cleaning function to the "Tweets" column
data['Cleaned Tweets'] = data['Tweets'].apply(clean_text)

# Drop rows with None values in the 'Cleaned Tweets' column
data.dropna(subset=['Cleaned Tweets'], inplace=True)

# Save the cleaned data to a new CSV file
data.to_csv('cleaned_twitter_training.csv', index=False)

# Data splitting and training and Feature extraction


In [4]:
# Remove rows with NaN values in the 'Cleaned Tweets' column
data = data.dropna(subset=['Cleaned Tweets'])

# Get the cleaned tweets as input data X
X = data['Cleaned Tweets']
y = data['Sentiment'] # Assuming you have a 'Sentiment' column

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize the SVM classifier
sentiment_classifier = SVC()

# Train the model on the training data
sentiment_classifier.fit(X_train, y_train)

# Predict the labels on the testing data
y_pred = sentiment_classifier.predict(X_test)

# Manual input


In [8]:
# Define a function to clean and preprocess a single text string
def clean_and_preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    cleaned_text = cleaned_text.lower()
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
    return cleaned_text

# Input string to predict sentiment for
input_text = "you are good student"

# Clean and preprocess the input text
cleaned_input = clean_and_preprocess_text(input_text)

# Convert the preprocessed input text into a TF-IDF vector
input_tfidf = tfidf_vectorizer.transform([cleaned_input])
print(cleaned_input)
# Predict the sentiment label for the input text
predicted_sentiment = sentiment_classifier.predict(input_tfidf)

# Print the predicted sentiment
print("Predicted Sentiment:", predicted_sentiment[0])


good student
Predicted Sentiment: Positive


In [6]:
import joblib

joblib.dump(sentiment_classifier, 'sentiment_classifier_model.pkl')
# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer2.pkl')

['tfidf_vectorizer2.pkl']

In [7]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score

# Assuming you have already trained your classifier (sentiment_classifier) and made predictions (y_pred) as mentioned earlier

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision, recall, and F1-score (you can specify the average and labels as needed)
precision = precision_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed
recall = recall_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed
f1 = f1_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed

# Generate a classification report
classification_rep = classification_report(y_test, y_pred)

# Print the metrics and classification report
print("Sentiment Analysis Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:\n", classification_rep)


Sentiment Analysis Metrics:
Accuracy: 0.8498796009631923
Precision: 0.8512768706819004
Recall: 0.8498796009631923
F1 Score: 0.8496052108000753

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.87      0.77      0.82      2483
    Negative       0.86      0.89      0.88      4394
     Neutral       0.87      0.83      0.85      3606
    Positive       0.81      0.87      0.84      4052

    accuracy                           0.85     14535
   macro avg       0.85      0.84      0.85     14535
weighted avg       0.85      0.85      0.85     14535

