In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# dropped the old names, so prefer the new name and fall back for old installs.
plt.style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep')
from sklearn.metrics import confusion_matrix
import nltk
# nltk.download('stopwords')

In [None]:
# Load the first dataset and shuffle the rows.
# A fixed random_state keeps the shuffled order identical on every
# Restart & Run All, so downstream outputs are reproducible.
dataframe1 = pd.read_csv('twitter_validation.csv', encoding='latin1')
dataframe1 = dataframe1.sample(frac=1, random_state=42)
dataframe1

# **Exploratory Data Analysis**

In [None]:
# Show the label of the last column.
# NOTE(review): the label looks like a data record rather than a header, so the CSV
# presumably has no header row and pandas used its first record as column names — confirm.
print(dataframe1.columns[-1])

In [None]:
# Rename the last column (the one printed in the previous cell) to 'text'.
# Renaming by position avoids depending on an exact, mis-encoded header string:
# rename() silently does nothing when the key does not match character for character.
# The guard keeps the cell safe to re-run after further columns have been appended.
if 'text' not in dataframe1.columns:
    dataframe1 = dataframe1.rename(columns={dataframe1.columns[-1]: 'text'})
dataframe1

In [None]:
# (number of rows, number of columns) of the loaded frame
dataframe1.shape

In [None]:
# Boolean frame of the same shape: True wherever a value is missing
dataframe1.isna()

In [None]:
# Number of missing values per column
dataframe1.isna().sum()

In [None]:
# Visualise where values are missing: one heatmap cell per DataFrame cell,
# coloured with a two-colour palette, without colour bar or row labels.
fig, ax = plt.subplots(figsize=(10, 6))
custom_cmap = sns.color_palette(["#FF1493", "#D3D3D3"])
sns.heatmap(
    dataframe1.isnull(),
    cmap=custom_cmap,
    cbar=False,
    yticklabels=False,
    ax=ax,
)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Count how many rows carry each distinct value of the 'Facebook' column
class_distribution_Facebook = dataframe1['Facebook'].value_counts()
print(class_distribution_Facebook)

In [None]:
# value_counts() sorts by frequency (descending), so the first entry is the
# majority class and the last entry the minority class, however many classes exist.
# Use .iloc for positional access: the index holds the column's distinct values,
# so `series[0]` only worked through a deprecated positional fallback.
ratio_majority_minority_Facebook = (
    class_distribution_Facebook.iloc[0] / class_distribution_Facebook.iloc[-1]
)
print(f"Ratio between majority and minority classes: {ratio_majority_minority_Facebook}")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-class row counts of the 'Facebook' column
ax = class_distribution_Facebook.plot.bar(title='Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
 class_distribution_Irrelevant = dataframe1['Irrelevant'].value_counts()
print(class_distribution_Irrelevant)

In [None]:
# value_counts() sorts by frequency (descending), so the first entry is the
# majority class and the last entry the minority class, however many classes exist.
# Use .iloc for positional access: the index holds the column's distinct values,
# so `series[0]` only worked through a deprecated positional fallback.
ratio_majority_minority_Irrelevant = (
    class_distribution_Irrelevant.iloc[0] / class_distribution_Irrelevant.iloc[-1]
)
print(f"Ratio between majority and minority classes: {ratio_majority_minority_Irrelevant}")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-class row counts of the 'Irrelevant' column
ax = class_distribution_Irrelevant.plot.bar(title='Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Distinct values present in the 'Facebook' column, as a Python set
set(dataframe1['Facebook'].to_numpy())

In [None]:
# Number of rows for each distinct 'Facebook' value
dataframe1.groupby('Facebook')['Facebook'].count()

In [None]:
# Distinct values present in the 'Irrelevant' column, as a Python set
set(pd.unique(dataframe1['Irrelevant']))

In [None]:
# Number of rows for each distinct 'Irrelevant' value
dataframe1.groupby('Irrelevant')['Irrelevant'].count()

In [None]:
# Bar chart of the row count per distinct 'Facebook' value
dataframe1.groupby('Facebook')['Facebook'].count().plot(kind='bar')
plt.show()

In [None]:
# Bar chart of the row count per distinct 'Irrelevant' value
dataframe1.groupby('Irrelevant')['Irrelevant'].count().plot(kind='bar')
plt.show()

In [None]:
# Group by the 'Facebook' and 'Irrelevant' columns and count occurrences.
# unstack() pivots the 'Irrelevant' values into columns, giving one row per
# 'Facebook' value; combinations that never occur show up as NaN.
grouped_data = dataframe1.groupby(['Facebook', 'Irrelevant']).size().unstack()
grouped_data

In [None]:
# Group by the 'Facebook' and 'Irrelevant' columns and count occurrences
# (same table as computed in the previous cell).
grouped_data = dataframe1.groupby(['Facebook', 'Irrelevant']).size().unstack()

# Plotting: one stacked bar per 'Facebook' value, one segment per 'Irrelevant' value
grouped_data.plot(kind='bar', stacked=True)
plt.show()

Let's make a new column to detect how long the text messages are

In [None]:
# Character count of every message. The vectorised .str.len() yields NaN for a
# missing text instead of raising TypeError the way apply(len) would.
dataframe1['Length'] = dataframe1['text'].str.len()
dataframe1

Let's look at summary statistics (count, mean, spread and quartiles) of the message lengths in our dataset

In [None]:
# Summary statistics of the message lengths computed above
dataframe1['Length'].describe()

# **Text Cleaning**

Let’s clean the text for the messages in our dataset with NLP.

In [None]:
import string
from nltk.corpus import stopwords
# Display the set of characters that will be treated as punctuation
string.punctuation

In [None]:
# Sample message used below to demonstrate punctuation removal
mess = '''For me / @the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later..'''

In [None]:
# Strip every punctuation character from the sample message in a single pass
punctuation_remover = str.maketrans('', '', string.punctuation)
nopunc = mess.translate(punctuation_remover)
print(nopunc)

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus, then display the English list
nltk.download('stopwords')
stopwords.words('english')

Let's create a function that removes all punctuation, removes all stopwords, and returns the cleaned text as a list of words

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def text_process(message):
    """
    Tokenise a message for bag-of-words vectorisation.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Split the remaining text on whitespace
    3. Drop every word whose lower-cased form is an English stop word

    Parameters
    ----------
    message : str
        Raw text of a single message.

    Returns
    -------
    list of str
        The remaining words, in original order and original casing.
    """
    # Delete all punctuation characters in one pass.
    nopunc = message.translate(str.maketrans('', '', string.punctuation))

    # The stop-word list used to be re-read and scanned as a list for every
    # single word; a cached frozenset makes each membership test O(1).
    stop_words = _english_stopwords()
    return [word for word in nopunc.split() if word.lower() not in stop_words]

**Vectorization**

***CountVectorizer :*** CountVectorizer is a feature extraction technique used to convert a collection of text documents to a matrix of token counts.

Now that each message can be turned into a list of tokens, we need to convert each of those messages into a numeric vector that scikit-learn's models can work with.

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vocabulary from the 'text' column, using text_process
# as the analyzer so each message is tokenised by that function.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(dataframe1['text'])
bow_transformer

In [None]:
# Mapping from each learned token to its column index in the count matrix
bow_transformer.vocabulary_

In [None]:
# Number of distinct tokens in the fitted vocabulary
print(len(bow_transformer.vocabulary_))

In [None]:
# Look up the message whose index label is 3 (label-based, so unaffected by the shuffle)
message4 = dataframe1.loc[3, 'text']
print(message4)

Now let's transform the entire DataFrame of messages and create a sparse matrix

In [None]:
# Token-count matrix: one row per message, one column per vocabulary token
messages_bow = bow_transformer.transform(dataframe1['text'])
messages_bow

In [None]:
# (number of messages, vocabulary size)
print('Shape of Sparse Matrix: ', messages_bow.shape)

# **TF-IDF**

Now let's compute term weighting and do normalisation with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the count matrix and re-weight that same matrix
tfidf_transformer = TfidfTransformer()
tfidf4 = tfidf_transformer.fit_transform(messages_bow)
print(tfidf4)

In [None]:
# Same shape as the count matrix: TF-IDF only re-weights the entries
print(tfidf4.shape)

In [None]:
# from sklearn.svm import SVC
# from datetime import datetime
# start_time = datetime.now()

# model = SVC()
# model.fit(x_train,y_train)

# end_time = datetime.now()
# process_time = round(end_time-start_time,2)
# print("Fitting SVC took {} seconds".format(process_time))

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. The fixed random_state makes the
# split, and therefore every reported metric, reproducible across runs.
text_train, text_test, label_train, label_test = train_test_split(
    dataframe1['text'],
    dataframe1['Irrelevant'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)



**Creating a Data Pipeline**

Let's run our model again and then predict the test set. We will create and use a pipeline for this purpose

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Text-classification pipeline: raw strings -> token counts -> TF-IDF weights -> SVM.
# Earlier notes recorded accuracy = 0.54 for SVC and 0.475 for imblearn's
# BalancedRandomForestClassifier(n_estimators=100, random_state=42).
pipeline_steps = [
    ('bow', CountVectorizer()),      # strings to token integer counts
    ('tfidf', TfidfTransformer()),   # integer counts to weighted TF-IDF scores
    ('classifier', SVC()),           # train on TF-IDF vectors w/ SVM
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit every pipeline step on the training messages and their labels
pipeline.fit(text_train,label_train)

In [None]:
# Peek at the second test message (positional slice 1:2 yields a one-element array)
text_test.iloc[1:2].values

In [None]:
# Predicted label for every held-out message
predictions = pipeline.predict(text_test)
predictions

In [None]:
# Display names for the four label classes, in alphabetical order
class_names = np.array(['Irrelevant', 'Negative', 'Neutral', 'Positive'])

# **Making Confusion Matrix**

The confusion matrix contains both the correct and the incorrect predictions that our model made on the test set: each row is a true class and each column a predicted class.

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
# Called as (true labels, predicted labels)
cm = confusion_matrix(label_test,predictions)
print(cm)

In [None]:
# Classification report (precision, recall, F1-score per class).
# scikit-learn expects (y_true, y_pred); passing them the other way round swaps
# the precision and recall columns. The already computed `predictions` are
# reused instead of running the pipeline over the test set a second time.
print(classification_report(label_test, predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Tick labels for the matrix, in alphabetical order.
# NOTE(review): this assumes `cm` has its rows/columns in the same order — confirm.
class_names = ['Irrelevant', 'Negative', 'Neutral', 'Positive']

# Change figure size and increase dpi for better resolution
# and get reference to axes object
fig, ax = plt.subplots(figsize=(8, 8), dpi=100)

# Initialize using the raw 2D confusion matrix and the class labels.
# Named cm_display so that IPython's built-in display() is not shadowed.
cm_display = ConfusionMatrixDisplay(cm, display_labels=class_names)

# set the plot title using the axes object
ax.set(title='Confusion Matrix for the Tweet Sentiment Model')

# show the plot.
# Pass the parameter ax to show customizations (ex. title)
cm_display.plot(ax=ax);

In [None]:
from sklearn import metrics

# Fraction of held-out messages whose predicted label equals the true label
accuracy = metrics.accuracy_score(label_test, predictions)
print("Accuracy:", accuracy)

In [None]:
# Classify one piece of text typed in by the user.
# NOTE: input() blocks until something is entered, so this cell needs an
# interactive session and will stall an unattended Run All.
sentiment = input("Enter text to classify = ")
sentiment_data = {'predict_sentiment': [sentiment]}
sentiment_data_df = pd.DataFrame(sentiment_data)

# The pipeline takes an iterable of raw strings and returns one label per string.
predict_sentiment_cat = pipeline.predict(sentiment_data_df['predict_sentiment'])
print("Predicted sentiment = ", predict_sentiment_cat[0])