***Problem Statement***

**To clean and preprocess a mental health-related Twitter dataset for accurate sentiment analysis in future model development.**

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Our column names were missing at the start

column_names = ['ID', 'Topic', 'Sentiment', 'Tweet Content']

# assign column names
data = pd.read_csv('twitter.csv', names=column_names, header=None)

print(data.head(10))



     ID        Topic Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   
5  2401  Borderlands  Positive   
6  2402  Borderlands  Positive   
7  2402  Borderlands  Positive   
8  2402  Borderlands  Positive   
9  2402  Borderlands  Positive   

                                       Tweet Content  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
5  im getting into borderlands and i can murder y...  
6  So I spent a few hours making something for fu...  
7  So I spent a couple of hours doing something f...  
8  So I spent a few hours doing something for fun...  
9  So I spent a few hours making something for fu...  


In [11]:
data.columns

Index(['ID', 'Topic', 'Sentiment', 'Tweet Content'], dtype='object')

In [12]:
# Exploratory data analysis
data.dtypes

ID                int64
Topic            object
Sentiment        object
Tweet Content    object
dtype: object

In [13]:
data.describe()

Unnamed: 0,ID
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             74682 non-null  int64 
 1   Topic          74682 non-null  object
 2   Sentiment      74682 non-null  object
 3   Tweet Content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [15]:
# Cleaning
data.isna().any().any()
# This means there is NAN present int the dataset

np.True_

In [16]:
# Drop rows with missing or invalid data
data = data.dropna(subset=['Tweet Content', 'Sentiment'])

# Ensure numeric conversion for plotting
data = data[data['Sentiment'].notna()]




## Data Cleaning ##

In [17]:
import re

def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    
    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)
    
    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)
    
    # Remove non-alphabetical characters (punctuation, numbers, etc.)
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    
    # Convert text to lowercase
    tweet = tweet.lower()
    
    # Remove stopwords
    stop_words = set(['a', 'an', 'the', 'in', 'on', 'at', 'for', 'to', 'of', 'is', 'are', 'am', 'was', 'were', 'be'])
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    
    return tweet

# Apply the function to the 'Tweet Content' column
data['Cleaned Tweet'] = data['Tweet Content'].apply(clean_tweet)

# Check the cleaned tweets
print(data[['Tweet Content', 'Cleaned Tweet']].head())


                                       Tweet Content  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                                       Cleaned Tweet  
0   im getting borderlands and i will murder you all  
1           i coming borders and i will kill you all  
2     im getting borderlands and i will kill you all  
3    im coming borderlands and i will murder you all  
4  im getting borderlands and i will murder you m...  


## Data Visualisation ## 

In [18]:
import matplotlib.pyplot as plt

# Scatter Plot
plt.figure(figsize=(10, 6))
plt.scatter(data['Tweet Content'], data['Sentiment'], alpha=0.5, color='blue')

# Add titles and labels
plt.title('Sentiment Score vs Tweet Content')
plt.xlabel('Tweet')
plt.ylabel('Sentiment Score')
plt.grid(True)
plt.show()



  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)
  fig.canvas.print_figure(bytes_io, **kw)


ValueError: 
fyi, your algorithms are failing to award credit. . Despite the actual credit that I have, because I have never subjugated myself to the system. . Amazon won't even lend me $$ at 27%. Think about that!. . Apple won't either. . Everyone already knows @Equifax is compromised, so . 🤟. 🤳. 🤷. ‍.  pic.twitter.com/1tNmB1pVOW
                                                                                                                                                                             ^
ParseException: Expected end of text, found '$'  (at char 173), (line:1, col:174)

<Figure size 1000x600 with 1 Axes>