In [1]:
#for datawrangling and manipulation

import pandas as pd
import numpy as np

#for NLP text processing and formatting

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# For word lemmitization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# for word Stemming
from nltk.stem.porter import PorterStemmer

# for Machine Learning process

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# for Machine Learning model evaluation

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Global Parameters
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to C:\Users\Lenovo
[nltk_data]     SSD\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def preprocess_tweet_text(tweet):
    """
    Function to process the the tweet text and tranform it into format usable by Machine learning models
    """
    
    # to convert all the characters of the tweet into lower case alphabets
    tweet.lower()
    
    # Remove urls from the tweets
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove user related references from the tweets:: '@' and '#' 
    tweet = re.sub(r'\@\w+|\#','', tweet)
    
    # Remove punctuations from the tweets
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords from the tweets
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    joined_text = " ".join(filtered_words)
    
    return joined_text

In [15]:
def get_feature_vector(train_fit):
    """
    Function to Convert a collection of raw documents to a matrix of TF-IDF features.
    TF-IDF - Term Frequency Inverse Documnet Frequency
    """
    
    vector = TfidfVectorizer(sublinear_tf=True)      # Defining the vector
    vector.fit(train_fit)                            # fitting the data into the vector
    return vector                                    # returning the vector as function call

# Importing the Dataset ::

In [5]:
dataset = pd.read_csv(r"C:\Users\Lenovo SSD\Desktop\twitter project\train.txt")
dataset

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...,...
21460,522949024132112384,neutral,"the day after newark ill be able to say """"i me..."
21461,522372593312350209,neutral,FEC hold farewell session for seven ministers ...
21462,522515200592052224,neutral,Luca Di Montezemolo (who's last day was Monday...
21463,523089087155437568,positive,Coffee is pretty much the answer to all questi...


# Preprocessing the Data ::

In [6]:
#Preprocess data before feeding it to ML models

processed_text = dataset['tweet_text'].apply(preprocess_tweet_text)
print("Processed text :: \n\n", processed_text)

Processed text :: 

 0          Gas house hit 339 Iu2019m going Chapel Hill Sat
1        Theo Walcott still shitu002c watch Rafa Johnny...
2        Iu2019m GSP fanu002c hate Nick Diaz canu2019t ...
3        Iranian general says Israelu2019s Iron Dome ca...
4        Tehranu002c Mon Amour Obama Tried Establish Ti...
                               ...                        
21460    day newark ill able say met demi lovato yester...
21461    FEC hold farewell session seven ministers Pres...
21462    Luca Di Montezemolo whos last day Monday Alons...
21463    Coffee pretty much answer questions today Frid...
21464    Niki Lauda confirmed Sky Alonso released conta...
Name: tweet_text, Length: 21465, dtype: object


In [7]:
# Saving the processed data into a new dataframe

processed_df = pd.DataFrame(processed_text, columns = ['tweet_text'])
processed_df

Unnamed: 0,tweet_text
0,Gas house hit 339 Iu2019m going Chapel Hill Sat
1,Theo Walcott still shitu002c watch Rafa Johnny...
2,Iu2019m GSP fanu002c hate Nick Diaz canu2019t ...
3,Iranian general says Israelu2019s Iron Dome ca...
4,Tehranu002c Mon Amour Obama Tried Establish Ti...
...,...
21460,day newark ill able say met demi lovato yester...
21461,FEC hold farewell session seven ministers Pres...
21462,Luca Di Montezemolo whos last day Monday Alons...
21463,Coffee pretty much answer questions today Frid...


# Performing Stemming ::

In [8]:
def stemming(text):
    """
    Function to perform stemming of the tweet text and return the processed stemmed text
    """
    stemmer = PorterStemmer()           # defining the porter stemming 
    s_words = stemmer.stem(text)        # stemming the text and saving the text in a new variable
    
    return s_words

### Feeding the processed text into the model and creating a new dataframe consisting of the new processed text

In [9]:
stemmed_words = pd.DataFrame(columns = ['tweet_text']).apply(stemming)                          # Applying the function
stemmed_df = pd.DataFrame(stemmed_words, columns = ['tweet_text'])        #creating the dataframe   

# Performing Lemmatization ::

In [10]:
def lemmatization(text):
    """
    """
    lemmatizer = WordNetLemmatizer()
    l_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return l_words

### Feeding the processed text into the model and creating a new dataframe consisting of the new processed text

In [11]:
lemma_words = stemmed_df['tweet_text'].apply(stemming)                      # Applying the function
lemmed_df = pd.DataFrame(lemma_words, columns = ['tweet_text'])             #creating the dataframe
lemmed_df['sentiment'] = dataset['sentiment']

# Performing Vectorization ::

In [16]:
tf_vector = get_feature_vector(np.array(dataset["tweet_text"]).ravel())

# Defining Predictor and Target Variables :

In [18]:
# Predictor Variable

X = tf_vector.transform(np.array(dataset["tweet_text"]).ravel())


# Target Variable

y = np.array(dataset["sentiment"]).ravel()


# Splitting the data into Training and Testing data ::

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

### hape of the Training and Testing data

In [20]:
print("Shape of the Training Predictor variable :: ", X_train.shape )
print("\nShape of the Test Predictor variable :: ", X_test.shape)
print("\nShape of the Training Target variable :: ", y_train.shape)
print("\nShape of the Test Target variable :: ", y_test.shape)

Shape of the Training Predictor variable ::  (17172, 42138)

Shape of the Test Predictor variable ::  (4293, 42138)

Shape of the Training Target variable ::  (17172,)

Shape of the Test Target variable ::  (4293,)


# Defining the Naive Bayes ML Model :: 

In [21]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)                            # fitting the data into the model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Predicting the values ::

In [22]:
naive_predict = naive_bayes_model.predict(X_test)

# Checking the accuracy of the model

In [23]:
print("Accuracy Score of the Naive Bayes Model is :: ", accuracy_score(y_test, naive_predict))

Accuracy Score of the Naive Bayes Model is ::  0.5918937805730259


### Classification Report of the Model ::

In [24]:
print("Classification_Report :: \n\n", classification_report(y_test, naive_predict))

Classification_Report :: 

               precision    recall  f1-score   support

    negative       1.00      0.00      0.00       676
     neutral       0.60      0.60      0.60      1809
    positive       0.58      0.81      0.68      1808

    accuracy                           0.59      4293
   macro avg       0.73      0.47      0.43      4293
weighted avg       0.66      0.59      0.54      4293



In [25]:
def get_tweet_sentiment(df): 
        ''' 
        Utility function to classify sentiment of passed tweet 
        using textblob's sentiment method 
        '''
        # create TextBlob object of passed tweet text 
        analysis = TextBlob(df) 
        
        # set sentiment 
        if analysis.sentiment.polarity > 0: 
            return 'positive'
        elif analysis.sentiment.polarity == 0: 
            return 'neutral'
        else: 
            return 'negative'

In [26]:
test = pd.read_csv("E:/Data Science/CSV dataset/Competition/test_samples.txt")
test

FileNotFoundError: [Errno 2] File b'E:/Data Science/CSV dataset/Competition/test_samples.txt' does not exist: b'E:/Data Science/CSV dataset/Competition/test_samples.txt'

In [22]:

processed_text2 = test['tweet_text'].apply(preprocess_tweet_text)             # Applying the function
processed_df2 = pd.DataFrame(processed_text2, columns = ['tweet_text'])       #creating the dataframe 
stemmed_words2 = test['tweet_text'].apply(stemming)                          # Applying the function
stemmed_df2 = pd.DataFrame(stemmed_words2, columns = ['tweet_text'])        #creating the dataframe 

lemma_words2 = stemmed_df2['tweet_text'].apply(stemming)                      # Applying the function
lemmed_df2 = pd.DataFrame(lemma_words2, columns = ['tweet_text'])             #creating the dataframe
lemmed_df2

NameError: name 'test' is not defined

In [23]:
predict = lemmed_df2['tweet_text'].apply(get_tweet_sentiment)

NameError: name 'lemmed_df2' is not defined

In [24]:
print("Accuracy Score of the Naive Bayes Model is :: ", accuracy_score(y_test, predict))

NameError: name 'y_test' is not defined

In [25]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)                            # fitting the data into the model
naive_predict = naive_bayes_model.predict(X_test)

NameError: name 'X_train' is not defined

In [26]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [27]:
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

NameError: name 'X_test' is not defined

In [28]:
from sklearn.metrics import classification_report

print("Classification_Report :: \n\n", classification_report(y_test, y_predict_lr))

NameError: name 'y_test' is not defined