# Importing libraries and dataset preprocessing

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Load data
data = pd.read_csv('Book2.csv')

# Define a function for text cleaning
def clean_text(text):
    # Lowercasing
    text = text.lower()
    
    # Removing URLs and User Mentions
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[\w_]+', '', text)
    
    # Tokenization and Removing Punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    
    # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into text
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Apply the cleaning function to the 'Tweets' column
data['cleaned_text'] = data['Tweets'].apply(clean_text)

# Removing Duplicates
data.drop_duplicates(subset=['cleaned_text'], inplace=True)

# Handling Missing Values
data.dropna(subset=['cleaned_text'], inplace=True)

# Now, 'data' contains the cleaned text ready for further analysis
# Optionally, you can save the cleaned data to a new CSV file
data.to_csv('cleaned_data.csv', index=False)
print("done cleaning")
data



done cleaning


Unnamed: 0,Tweets,Category,cleaned_text
0,Excited to watch the new superhero movie tonight!,cinema,excited watch new superhero movie tonight
1,The latest smartphone features are impressive,technology,latest smartphone features impressive
2,Our team won the championship! What a game,sports,team championship game
3,Enjoyed the new action-packed movie last night,cinema,enjoyed new movie last night
4,Can't believe our team made it to the finals,sports,ca believe team made finals
...,...,...,...
1130,The precision in archery and the strategy in c...,sports,precision archery strategy chess share element...
1132,The commitment and training required in sports...,sports,commitment training required sports mirror ded...
1133,The dynamics of teamwork in sports teams echo ...,sports,dynamics teamwork sports teams echo importance...
1135,Cinema takes us on emotional journeys sparking...,cinema,cinema takes us emotional journeys sparking im...


# Feature extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


# Load cleaned data
data = pd.read_csv('cleaned_data.csv')

# Assuming you have a 'Category' column for labels
X = data['Tweets']  # Feature: Tweets
y = data['Category']  # Labels: Category

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features

# Transform the text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Now, X_tfidf contains the TF-IDF features
# You can use X_tfidf for model training

# Data Splitting and training 

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize the SVM classifier
domain_classifier = SVC()

# Train the model on the training data
domain_classifier.fit(X_train, y_train)

print(X_test)
# Predict the labels on the testing data
y_pred = domain_classifier.predict(X_test)



  (0, 967)	0.3242062454023185
  (0, 126)	0.3130248707063763
  (0, 507)	0.29214694508464695
  (0, 184)	0.2887689400861184
  (0, 141)	0.2194351792618703
  (0, 440)	0.3307691182296963
  (0, 655)	0.29574303262248713
  (0, 738)	0.25900985453566816
  (0, 461)	0.3130248707063763
  (0, 248)	0.24394927258795795
  (0, 574)	0.2501467669410691
  (0, 365)	0.2099094077293855
  (0, 603)	0.08097613684050885
  (0, 885)	0.15853617812617907
  (0, 41)	0.07666163891249851
  (0, 886)	0.06548026421655631
  (0, 902)	0.100357564641463
  (1, 137)	0.4587754771154074
  (1, 828)	0.37779536996395574
  (1, 492)	0.22606896634821608
  (1, 661)	0.4076827180661518
  (1, 32)	0.34014176443363026
  (1, 41)	0.1014146622606449
  (1, 774)	0.4587754771154074
  (1, 49)	0.27459312857209306
  :	:
  (221, 165)	0.3447777595522013
  (221, 60)	0.17154184938636138
  (221, 113)	0.25017202361894814
  (221, 529)	0.18631170051913482
  (221, 492)	0.16505958660602368
  (221, 41)	0.07404582105598524
  (221, 486)	0.3131438092485012
  (221, 61

# Manual input

In [7]:


# Manual input for prediction

manual_input = "This is a cricket match that i will never forget"


cleaned_input = clean_text(manual_input)
print(cleaned_input)
# Vectorize the cleaned input using the TF-IDF vectorize


vectorized_input = tfidf_vectorizer.transform([cleaned_input])
# Predict the domain label on the manual input

predicted_domain_label = domain_classifier.predict(vectorized_input)

print("Predicted Label:", predicted_domain_label[0])


# Convert the predicted label to the corresponding domain category
predicted_domain_category = predicted_domain_label[0]

print("Predicted Domain:", predicted_domain_category)


cricket match never forget
Predicted Label: sports
Predicted Domain: sports


In [8]:
import joblib


joblib.dump(domain_classifier, 'domain_classifier_model.pkl')
# Save the TF-IDF vectorizer


['domain_classifier_model.pkl']

In [9]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score

# Assuming you have already trained your classifier and made predictions as mentioned earlier

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision, recall, and F1-score (you can specify the average and labels as needed)
precision = precision_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed
recall = recall_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed
f1 = f1_score(y_test, y_pred, average='weighted')  # You can change 'average' as needed

# Generate a classification report
classification_rep = classification_report(y_test, y_pred)

# Print the metrics and classification report
print("Domain Classification Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:\n", classification_rep)


Domain Classification Metrics:
Accuracy: 0.9596412556053812
Precision: 0.9630261405342738
Recall: 0.9596412556053812
F1 Score: 0.9599076125104458

Classification Report:
               precision    recall  f1-score   support

      cinema       0.99      0.95      0.97        88
      sports       0.90      1.00      0.95        73
  technology       1.00      0.92      0.96        62

    accuracy                           0.96       223
   macro avg       0.96      0.96      0.96       223
weighted avg       0.96      0.96      0.96       223

