<a href="https://colab.research.google.com/github/tonysarre/extract/blob/main/FutureIntern_DS_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Step 1: Load Necessary Libraries

In [14]:
pip install pandas numpy scikit-learn nltk




In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


###Step 2: Load the Dataset

In [16]:
# Load the dataset
data = pd.read_csv("/content/drive/MyDrive/Future Intern/tweet/Tweets.csv")  # Update the path accordingly

# Display the first few rows of the dataset
print(data.head())


             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

###Step 3: Data Exploration and Preprocessing

In [17]:
# Check the shape and columns of the dataset
print(data.shape)
print(data.columns)

# Check for missing values
print(data.isnull().sum())


(14640, 15)
Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')
tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64


In [18]:
# Data Cleaning - Keep only the necessary columns and remove rows with missing values
data_cleaned = data[['text', 'airline_sentiment']].dropna()

# Check for missing values in the cleaned dataset
print(data_cleaned.isnull().sum())

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

text                 0
airline_sentiment    0
dtype: int64


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
# Define text preprocessing function
def preprocess_text(text):
    # Remove URLs and mentions
    text = re.sub(r'http\S+|www\S+|@\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase and strip
    text = text.lower().strip()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [22]:
# Apply the preprocessing to the text column
data_cleaned['cleaned_text'] = data_cleaned['text'].apply(preprocess_text)


In [23]:
# Check the cleaned text
print(data_cleaned['cleaned_text'].head())


0                                                 said
1        plus youve added commercials experience tacky
2         didnt today must mean need take another trip
3    really aggressive blast obnoxious entertainmen...
4                                 really big bad thing
Name: cleaned_text, dtype: object


In [25]:
# Feature Extraction - Convert text to numerical form using CountVectorizer
vectorizer = CountVectorizer()

# Transform the text data into a bag-of-words representation
X = vectorizer.fit_transform(data_cleaned['cleaned_text'])

# Target variable (airline sentiment)
y = data_cleaned['airline_sentiment']

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7807377049180327
              precision    recall  f1-score   support

    negative       0.80      0.93      0.86      1889
     neutral       0.66      0.41      0.51       580
    positive       0.76      0.62      0.68       459

    accuracy                           0.78      2928
   macro avg       0.74      0.65      0.68      2928
weighted avg       0.77      0.78      0.76      2928

