In [38]:
import pandas as pd
# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
dataset=pd.read_csv("tweets.csv")

In [3]:
import re

def clean_tweet(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters, numbers
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

In [4]:
dataset.head()

Unnamed: 0,author,content
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...
1,katyperry,@barackobama Thank you for your incredible gra...
2,katyperry,Life goals. https://t.co/XIn1qKMKQl
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...


In [5]:
dataset['clean_content']= dataset["content"].apply(clean_tweet)

In [18]:
dataset.head()

Unnamed: 0,author,content,clean_content
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself
1,katyperry,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves


In [6]:
import nltk
from nltk.tokenize import word_tokenize

dataset["tokens"] = dataset["clean_content"].apply(word_tokenize)

In [23]:
dataset.head()

Unnamed: 0,author,content,clean_content,tokens
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself,"[is, history, repeating, itself]"
1,katyperry,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...,"[thank, you, for, your, incredible, grace, in,..."
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals,"[life, goals]"
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now,"[me, right, now]"
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves,"[sisters, are, doin, it, for, themselves]"


In [7]:
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

dataset["tokens"] = dataset["tokens"].apply(lambda words: [word for word in words if word not in stop_words])


[nltk_data] Downloading package stopwords to C:\Users\Zeeshan
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
dataset.head()

Unnamed: 0,author,content,clean_content,tokens
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself,"[history, repeating]"
1,katyperry,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...,"[thank, incredible, grace, leadership, excepti..."
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals,"[life, goals]"
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now,[right]
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves,"[sisters, doin]"


In [8]:
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

dataset["tokens_lemmatized"] = dataset["tokens"].apply(lambda words: [lemmatizer.lemmatize(word) for word in words])


[nltk_data] Downloading package wordnet to C:\Users\Zeeshan
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
dataset.head()

Unnamed: 0,author,content,clean_content,tokens,tokens_lemmatized
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself,"[history, repeating]","[history, repeating]"
1,katyperry,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...,"[thank, incredible, grace, leadership, excepti...","[thank, incredible, grace, leadership, excepti..."
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals,"[life, goals]","[life, goal]"
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now,[right],[right]
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves,"[sisters, doin]","[sister, doin]"


In [9]:
from nltk import pos_tag

nltk.download("averaged_perceptron_tagger")

dataset["pos_tags"] = dataset["tokens"].apply(pos_tag)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zeeshan Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [30]:
dataset.head()

Unnamed: 0,author,content,clean_content,tokens,tokens_lemmatized,pos_tags
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself,"[history, repeating]","[history, repeating]","[(history, NN), (repeating, VBG)]"
1,katyperry,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...,"[thank, incredible, grace, leadership, excepti...","[thank, incredible, grace, leadership, excepti...","[(thank, NN), (incredible, JJ), (grace, NN), (..."
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals,"[life, goals]","[life, goal]","[(life, NN), (goals, NNS)]"
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now,[right],[right],"[(right, NN)]"
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves,"[sisters, doin]","[sister, doin]","[(sisters, NNS), (doin, VBP)]"


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

dataset['author']=le.fit_transform(dataset['author'])

In [33]:
dataset.head()

Unnamed: 0,author,content,clean_content,tokens,tokens_lemmatized,pos_tags
0,14,Is history repeating itself...?#DONTNORMALIZEH...,is history repeating itself,"[history, repeating]","[history, repeating]","[(history, NN), (repeating, VBG)]"
1,14,@barackobama Thank you for your incredible gra...,thank you for your incredible grace in leaders...,"[thank, incredible, grace, leadership, excepti...","[thank, incredible, grace, leadership, excepti...","[(thank, NN), (incredible, JJ), (grace, NN), (..."
2,14,Life goals. https://t.co/XIn1qKMKQl,life goals,"[life, goals]","[life, goal]","[(life, NN), (goals, NNS)]"
3,14,Me right now 🙏🏻 https://t.co/gW55C1wrwd,me right now,[right],[right],"[(right, NN)]"
4,14,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters are doin it for themselves,"[sisters, doin]","[sister, doin]","[(sisters, NNS), (doin, VBP)]"


In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

dataset["sentiment"] = dataset["clean_content"].apply(lambda text: sia.polarity_scores(text)["compound"])


[nltk_data] Downloading package vader_lexicon to C:\Users\Zeeshan
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [20]:
dataset.tail()

Unnamed: 0,author,content,clean_content,tokens,tokens_lemmatized,pos_tags,sentiment
52537,9,Life couldn't be better right now. 😊,life couldnt be better right now,"[life, couldnt, better, right]","[life, couldnt, better, right]","[(life, NN), (couldnt, NN), (better, RBR), (ri...",-0.3412
52538,9,First Monday back in action. I'd say 21.6 mile...,first monday back in action id say miles is a ...,"[first, monday, back, action, id, say, miles, ...","[first, monday, back, action, id, say, mile, s...","[(first, RB), (monday, JJ), (back, RB), (actio...",0.5719
52539,9,"Crime shows, buddy, snuggles = the perfect Sun...",crime shows buddy snuggles the perfect sunday ...,"[crime, shows, buddy, snuggles, perfect, sunda...","[crime, show, buddy, snuggle, perfect, sunday,...","[(crime, NN), (shows, NNS), (buddy, VBP), (snu...",0.0516
52540,9,❄️ http://t.co/sHCFdPpGPa,,[],[],[],0.0
52541,9,❤️❄️✈️ http://t.co/ixmB5lv17Z,,[],[],[],0.0


In [None]:
import nltk
from nltk import pos_tag, ne_chunk

# Download required NLTK resources
nltk.download("maxent_ne_chunker")
nltk.download("words")
nltk.download("averaged_perceptron_tagger")  # Needed for POS tagging

# Apply Named Entity Recognition (NER)
dataset["named_entities"] = dataset["tokens"].apply(lambda tokens: ne_chunk(pos_tag(tokens)))


In [19]:
from nltk import ne_chunk

# POS-tagged sentence (input)
pos_tagged_sentence = [('life', 'NN'), ('couldnt', 'NN'), ('better', 'RBR'), ('right', 'NN')]

# Apply Named Entity Recognition (NER)
ner_tree = ne_chunk(pos_tagged_sentence)

# Print the result
print(ner_tree)


(S life/NN couldnt/NN better/RBR right/NN)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)  # Limit features for efficiency
X = vectorizer.fit_transform(dataset["clean_content"])  # Transform text into TF-IDF vectors


In [22]:
from sklearn.model_selection import train_test_split

y = dataset["author"]  # Target variable (Author ID)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)


In [28]:
from sklearn.svm import SVC

model = SVC(kernel="linear")
model.fit(X_train, y_train)


In [29]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.47426015795984394
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.45      0.45       624
           1       0.90      0.86      0.88       588
           2       0.64      0.58      0.61       510
           3       0.46      0.43      0.45       587
           4       0.37      0.50      0.43       637
           5       0.64      0.51      0.57       450
           6       0.28      0.43      0.34       616
           7       0.45      0.42      0.43       606
           8       0.91      0.85      0.88       350
           9       0.31      0.20      0.24       428
          10       0.53      0.49      0.51       517
          11       0.62      0.53      0.57       640
          12       0.42      0.31      0.36       457
          13       0.39      0.29      0.33       410
          14       0.36      0.29      0.32       583
          15       0.53      0.35      0.42       451
          16       0.21    

In [30]:
new_content = ["Had a great time at the beach today!"]
new_content_tfidf = vectorizer.transform(new_content)  # Transform text into TF-IDF

predicted_author = model.predict(new_content_tfidf)
print("Predicted Author:", predicted_author[0])


Predicted Author: 13


In [31]:
le.inverse_transform(predicted_author)

array(['justinbieber'], dtype=object)

In [37]:
# Define Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel="linear"),
    "Naive Bayes": MultinomialNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}

# Train & Evaluate Each Model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # Make predictions

    # Evaluate Model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("=" * 50)


Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.4860
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.43      0.46       624
           1       0.87      0.89      0.88       588
           2       0.62      0.62      0.62       510
           3       0.48      0.45      0.46       587
           4       0.41      0.54      0.46       637
           5       0.63      0.53      0.57       450
           6       0.31      0.40      0.35       616
           7       0.47      0.42      0.44       606
           8       0.88      0.86      0.87       350
           9       0.28      0.19      0.23       428
          10       0.54      0.50      0.52       517
          11       0.58      0.56      0.57       640
          12       0.42      0.33      0.37       457
          13       0.39      0.26      0.31       410
          14       0.34      0.31      0.33       583
          15       0.48      0.36      0.41       451
          16       0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.4394
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.40      0.42       624
           1       0.85      0.83      0.84       588
           2       0.56      0.56      0.56       510
           3       0.52      0.38      0.44       587
           4       0.37      0.43      0.40       637
           5       0.60      0.51      0.55       450
           6       0.15      0.51      0.23       616
           7       0.43      0.35      0.38       606
           8       0.83      0.79      0.81       350
           9       0.25      0.18      0.21       428
          10       0.55      0.45      0.50       517
          11       0.52      0.52      0.52       640
          12       0.45      0.33      0.38       457
          13       0.36      0.30      0.33       410
          14       0.33      0.25      0.29       583
          15       0.46      0.30      0.36       451
          16       0.43      0.2

In [None]:
# Define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],        # Regularization parameter
    'kernel': ['linear', 'rbf'],   # Kernel type
    'gamma': ['scale', 'auto']     # Kernel coefficient
}

# Initialize GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Evaluate on test data
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)