In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
import re
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [3]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VEERENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VEERENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VEERENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\VEERENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\VEERENDRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    """
    Comprehensive text preprocessing function that handles various aspects of text cleaning
    
    Parameters:
        text (str): Input text to be preprocessed
        
    Returns:
        str: Cleaned and preprocessed text
    """
    # Convert to string if not already
    text = str(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove hashtags and mentions
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Handle contractions (e.g., "don't" -> "do not")
    text = contractions.fix(text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join tokens back into text
    return ' '.join(tokens)


In [22]:
df = pd.read_csv('train3.csv')

In [10]:
# import pandas as pd


# # Take the first 10,000 rows
# df = data.iloc[:10000]

# # Display the subset data
# print(df.head())




                                                Text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [23]:
df.shape

(162980, 2)

In [24]:
print("Preprocessing texts...")
df['processed_text'] = df['Text'].apply(preprocess_text)

Preprocessing texts...


In [25]:
df.head

<bound method NDFrame.head of                                                      Text  category  \
0       when modi promised “minimum government maximum...      -1.0   
1       talk all the nonsense and continue all the dra...       0.0   
2       what did just say vote for modi  welcome bjp t...       1.0   
3       asking his supporters prefix chowkidar their n...       1.0   
4       answer who among these the most powerful world...       1.0   
...                                                   ...       ...   
162975  why these 456 crores paid neerav modi not reco...      -1.0   
162976  dear rss terrorist payal gawar what about modi...      -1.0   
162977  did you cover her interaction forum where she ...       0.0   
162978  there big project came into india modi dream p...       0.0   
162979  have you ever listen about like gurukul where ...       1.0   

                                           processed_text  
0       modi promised minimum government maximum gover...

In [26]:
nan_count = df.isna().sum()
nan_count

Text              8699
category          8698
processed_text       0
dtype: int64

In [27]:
df = df.dropna()

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], 
    df['category'],
    test_size=0.2, 
    random_state=42,
    stratify=df['category']  # Ensure balanced split
)


In [29]:
print("Creating TF-IDF vectors...")
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit features to prevent overfitting
    ngram_range=(1, 2),  # Use both unigrams and bigrams
    min_df=5  # Minimum document frequency
)

# Transform the text data into TF-IDF features
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

Creating TF-IDF vectors...


In [35]:
# model = LogisticRegression(
#     multi_class='ovr',  # One-vs-Rest strategy
#     max_iter=1000,      # Increase max iterations to ensure convergence
#     C=1.0,             # Inverse of regularization strength
#     random_state=42
# )

# # Fit the model
# model.fit(X_train_tfidf, y_train)

In [30]:
from sklearn.svm import SVC

In [31]:
def train_evaluate_svm(kernel='linear'):
    print(f"\nTraining SVM with {kernel} kernel...")
    
    # Initialize SVM
    svm_model = SVC(
        kernel=kernel,
        C=1.0,
        random_state=42,
        probability=True
    )
    
    # Train the model
    svm_model.fit(X_train_tfidf, y_train)
    
    # Make predictions
    y_pred = svm_model.predict(X_test_tfidf)
    
    # Print classification report
    print(f"\nClassification Report for {kernel} kernel SVM:")
    print(classification_report(y_test, y_pred))
    
    return svm_model, y_pred

In [32]:
print("\n=== Linear Kernel SVM ===")
linear_svm, linear_predictions = train_evaluate_svm('linear')



=== Linear Kernel SVM ===

Training SVM with linear kernel...

Classification Report for linear kernel SVM:
              precision    recall  f1-score   support

        -1.0       0.85      0.77      0.81      6746
         0.0       0.84      0.96      0.90     10464
         1.0       0.93      0.87      0.90     13646

    accuracy                           0.88     30856
   macro avg       0.87      0.87      0.87     30856
weighted avg       0.88      0.88      0.88     30856



In [34]:
import pickle

model_filename = "linear_svm_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(linear_svm, file)

print(f"\nModel saved to {model_filename}")


Model saved to linear_svm_model.pkl
