***Problem Statement***

**To clean and preprocess a mental health-related Twitter dataset for accurate sentiment analysis in future model development.**

In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Our column names were missing at the start

column_names = ['ID', 'Topic', 'Sentiment', 'Tweet Content']

# assign column names
data = pd.read_csv('twitter.csv', names=column_names, header=None)

print(data.head(10))



     ID        Topic Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   
5  2401  Borderlands  Positive   
6  2402  Borderlands  Positive   
7  2402  Borderlands  Positive   
8  2402  Borderlands  Positive   
9  2402  Borderlands  Positive   

                                       Tweet Content  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
5  im getting into borderlands and i can murder y...  
6  So I spent a few hours making something for fu...  
7  So I spent a couple of hours doing something f...  
8  So I spent a few hours doing something for fun...  
9  So I spent a few hours making something for fu...  


In [42]:
data.columns

Index(['ID', 'Topic', 'Sentiment', 'Tweet Content'], dtype='object')

In [43]:
# Exploratory data analysis
data.dtypes

ID                int64
Topic            object
Sentiment        object
Tweet Content    object
dtype: object

In [44]:
data.describe()

Unnamed: 0,ID
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             74682 non-null  int64 
 1   Topic          74682 non-null  object
 2   Sentiment      74682 non-null  object
 3   Tweet Content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


## Data Cleaning ##

In [46]:

#  missing values
rows_with_nan = data[data.isna().any(axis=1)]  # Rows containing NaN values

# duplicate rows
duplicate_rows = data[data.duplicated(subset=['Tweet Content'], keep='first')]

# rows with invalid sentiment values 
valid_sentiments = ['positive', 'neutral', 'negative']
invalid_sentiments = data[~data['Sentiment'].isin(valid_sentiments)]

# Combine all filtered rows f
filtered = pd.concat([rows_with_nan, duplicate_rows, invalid_sentiments]).drop_duplicates()

# removing filtered rows and test
data_cleaned = data.drop(filtered.index)

print(data_cleaned.isin(filtered).any().any())  # Should hopefully return False 



False


In [47]:
import re

def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    
    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)
    
    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)
    
    # Remove non-alphabetical characters (punctuation, numbers, etc.)
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    
    # Convert text to lowercase
    tweet = tweet.lower()
    
    # Remove stopwords
    stop_words = set(['a', 'an', 'the', 'in', 'on', 'at', 'for', 'to', 'of', 'is', 'are', 'am', 'was', 'were', 'be'])
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    
    return tweet

# Apply the function to column
data['Cleaned Tweet'] = data['Tweet Content'].apply(clean_tweet)

# Check the cleaned tweets
print(data[['Tweet Content', 'Cleaned Tweet']].head())


TypeError: expected string or bytes-like object, got 'float'

## Data Visualisation ## 

In [48]:
import matplotlib.pyplot as plt

# Scatter Plot
plt.figure(figsize=(10, 6))
plt.scatter(data['Cleaned Tweet'], data['Sentiment'], alpha=0.5, color='blue')

# Add titles and labels
plt.title('Sentiment Score vs Tweet Content')
plt.xlabel('Tweet')
plt.ylabel('Sentiment Score')
plt.grid(True)
plt.show()



KeyError: 'Cleaned Tweet'

<Figure size 1000x600 with 0 Axes>