In [27]:
import random
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import nltk.classify.util
import numpy as np

In [28]:
import spacy  

# Load spaCy's English language model
nlp = spacy.load('en_core_web_sm') 

In [29]:
# Load twitter samples dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [30]:
# Combine the datasets and create labels
tweets = positive_tweets + negative_tweets
labels = ['Positive'] * len(positive_tweets) + ['Negative'] * len(negative_tweets)

In [31]:
# Shuffle the dataset
combined = list(zip(tweets, labels))
random.shuffle(combined)
tweets, labels = zip(*combined)

In [32]:
# Additional custom stopwords
custom_stopwords = set(["i'm", "rt", "u", "4", "2", "im", "ur", "don't", "amp"])
stop_words = set(stopwords.words('english')).union(custom_stopwords)

In [33]:
# Preprocessing function - Fine-tuning for NLP models: text preprocessing, stopword removal
def preprocess_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [34]:
# Preprocess all tweets
processed_tweets = [preprocess_text(tweet) for tweet in tweets]

In [35]:
# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tweets = [[stemmer.stem(word) for word in tweet] for tweet in processed_tweets]
lemmatized_tweets = [[lemmatizer.lemmatize(word) for word in tweet] for tweet in processed_tweets]

In [36]:
# Extract bigrams and unigrams
def get_bigrams_and_unigrams(tokens):
    bigrams = list(nltk.bigrams(tokens))
    bigrams = ['_'.join(bigram) for bigram in bigrams]
    return tokens + bigrams

In [37]:
# Combine bigrams with unigrams
all_words = [word.lower() for tweet in lemmatized_tweets for word in get_bigrams_and_unigrams(tweet)]
all_words_freq = FreqDist(all_words)

In [38]:
# Select the top 2000 words as features
word_features = list(all_words_freq.keys())[:2000]

In [39]:
# Feature scaling function
def scale_features(features):
    scaled_features = {}
    max_value = max(features.values()) if features else 1
    for word, present in features.items():
        scaled_features[word] = present / max_value
    return scaled_features

In [40]:
# Define document features
def document_features(document):
    document_words = set(document)
    features = {'contains({})'.format(word): (word in document_words) for word in word_features}
    return scale_features(features)

In [41]:
# Entity Recognition - Identify named entities in tweets
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [42]:
# Process tweets for entity recognition
for tweet in tweets[:5]:  # Limiting to first 5 tweets for brevity
    entities = extract_entities(tweet)
    if entities:
        print(f"Tweet: {tweet}")
        print(f"Entities: {entities}\n")

Tweet: Spring Fiesta 2015 

Make Some Noise, Awuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu!!!!!!!!! Aich :)
Entities: [('Spring Fiesta 2015', 'EVENT')]

Tweet: @avonfvcks ID BE SO GRATEFUL IF YOU GIFTED ME #NotAnApology :-)
Entities: [('@avonfvcks ID BE SO GRATEFUL', 'ORG')]

Tweet: @ClearlyArticle :) its 430 am smh
Entities: [('430', 'CARDINAL')]

Tweet: Last time I was here, was a funeral and a again funeral. Modimo ho tseba wena fela. :( — feeling emotional at... http://t.co/mQYsswdot7
Entities: [('Modimo', 'GPE'), ('http://t.co/mQYsswdot7', 'PERSON')]

Tweet: @kendrahatesu FOLLOWED ME THANKS, AND
@justinbieber PLEASE FOLLOWED ME TOO :(
Entities: [('@justinbieber', 'DATE'), ('TOO', 'ORG')]



In [43]:
# Create feature sets for training and testing
feature_sets = [(document_features(get_bigrams_and_unigrams(tweet)), label) for tweet, label in zip(lemmatized_tweets, labels)]
train_set, test_set = feature_sets[1000:], feature_sets[:1000]

In [44]:
# Train Naive Bayes Classifier - Text classification
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
nb_accuracy = nltk.classify.util.accuracy(nb_classifier, test_set)
print(f'Naive Bayes Accuracy: {nb_accuracy * 100:.2f}%')

Naive Bayes Accuracy: 99.90%


In [45]:
# Show most informative features
nb_classifier.show_most_informative_features(10)

Most Informative Features
           contains(:_() = 1.0            Negati : Positi =   1160.9 : 1.0
           contains(:_)) = 1.0            Positi : Negati =    920.9 : 1.0
           contains(-_)) = 1.0            Positi : Negati =    396.2 : 1.0
           contains((_() = 1.0            Negati : Positi =    249.1 : 1.0
        contains((_http) = 1.0            Negati : Positi =    223.8 : 1.0
           contains()_)) = 1.0            Positi : Negati =    164.3 : 1.0
        contains()_http) = 1.0            Positi : Negati =     76.2 : 1.0
             contains(() = 1.0            Negati : Positi =     64.8 : 1.0
             contains()) = 1.0            Positi : Negati =     62.3 : 1.0
      contains(follower) = 1.0            Positi : Negati =     47.0 : 1.0


In [46]:
# Train Decision Tree Classifier - Text classification
dt_classifier = nltk.SklearnClassifier(DecisionTreeClassifier()).train(train_set)
dt_accuracy = nltk.classify.util.accuracy(dt_classifier, test_set)
print(f'Decision Tree Accuracy: {dt_accuracy * 100:.2f}%')

Decision Tree Accuracy: 100.00%
