## 1. Become one with the data

In [None]:
# Download the tweets CSV from GitHub; wget stores it as tweets.csv in the
# current working directory.
!wget https://raw.githubusercontent.com/varmatilak22/data/refs/heads/main/tweets.csv

--2024-10-16 12:14:04--  https://raw.githubusercontent.com/varmatilak22/data/refs/heads/main/tweets.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1615005 (1.5M) [text/plain]
Saving to: ‘tweets.csv’


2024-10-16 12:14:05 (22.7 MB/s) - ‘tweets.csv’ saved [1615005/1615005]



In [None]:
import pandas as pd

# Load the dataset. The download cell saves tweets.csv into the current
# working directory, so a relative path is used here; the previous absolute
# '/content/...' path only resolved when the kernel happened to run in /content.
df = pd.read_csv('tweets.csv')

# Inspect the data
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(11370, 5)

## 2. Text Preprocessing

In [None]:
import re
from nltk.corpus import stopwords

# Function to clean and preprocess text
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop the '#'
    symbol (the hashtag word itself is kept), then strip every remaining
    character that is neither a word character nor whitespace.

    Args:
        text: Raw tweet as a ``str``.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        can leave consecutive spaces behind.
    """
    text = text.lower()  # Lowercasing
    # 'http\S+' already covers https links, so no separate alternative is
    # needed; no anchors are used, so no MULTILINE flag either.
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    # '@\w+' removes the whole handle. The earlier pattern '\@w+' only matched
    # '@' followed by literal 'w' characters, so usernames stayed in the text.
    text = re.sub(r'@\w+|#', '', text)  # Remove mentions and '#' symbols
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

# Run clean_text over every tweet and overwrite the 'text' column with the result
df['text'] = df['text'].map(clean_text)

In [None]:
# Preview the first rows again to check the cleaned 'text' column
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,communal violence in bhainsa telangana stones ...,1
1,1,ablaze,,telangana section 144 has been imposed in bhai...,1
2,2,ablaze,New York City,arsonist sets cars ablaze at dealership,1
3,3,ablaze,"Morgantown, WV",arsonist sets cars ablaze at dealership,1
4,4,ablaze,,lord jesus your love brings freedom and pardon...,0


## 3. Tokenisation and padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a word-level tokenizer on the tweets. num_words caps the vocabulary used
# when encoding; words outside it are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(df['text'])

# Encode each tweet as a list of integer word ids
sequences = tokenizer.texts_to_sequences(df['text'])

# Bring every id list to length 100, adding the padding at the end ('post')
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,
    padding='post',
)

## 4. Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Features are the padded id matrix; labels are the 'target' column as an array
X, y = padded_sequences, df['target'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 5. Model Building (LSTM)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, Dropout

# Define the model: embedding -> LSTM -> dropout -> dense head
model = Sequential()
# Declare the input shape explicitly (100 token ids per tweet) rather than via
# the Embedding `input_length` argument, which Keras 3 deprecates. An explicit
# Input also lets model.summary() report output shapes before training.
model.add(Input(shape=(100,)))
# mask_zero=True treats id 0 as padding (pad_sequences pads with 0 by default)
# so the LSTM skips those timesteps. The sequences are padded at the end, so
# without the mask the LSTM's final state is computed after reading the
# trailing padding.
# NOTE(review): this is the likely reason validation accuracy stayed flat
# across epochs - confirm by retraining.
model.add(Embedding(input_dim=10000, output_dim=64, mask_zero=True))
model.add(LSTM(128, return_sequences=False))  # Only the final state feeds the head
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

## 6. Training the model

In [None]:
# Fit for 5 epochs in mini-batches of 64, reporting loss and accuracy on the
# test split after every epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 282ms/step - accuracy: 0.8136 - loss: 0.5115 - val_accuracy: 0.8259 - val_loss: 0.4698
Epoch 2/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 292ms/step - accuracy: 0.8078 - loss: 0.4942 - val_accuracy: 0.8259 - val_loss: 0.4772
Epoch 3/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 237ms/step - accuracy: 0.8094 - loss: 0.4909 - val_accuracy: 0.8259 - val_loss: 0.4647
Epoch 4/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 269ms/step - accuracy: 0.8085 - loss: 0.4934 - val_accuracy: 0.8259 - val_loss: 0.4636
Epoch 5/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 255ms/step - accuracy: 0.8098 - loss: 0.4891 - val_accuracy: 0.8259 - val_loss: 0.4641


## 7. Model Evaluation

In [None]:
# Score the trained model on the test split; with one compiled metric,
# evaluate() yields the loss followed by the accuracy
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_accuracy = test_metrics
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 67ms/step - accuracy: 0.8245 - loss: 0.4660
Test Loss: 0.4641016125679016
Test Accuracy: 0.8258575201034546


## 8. Make predictions

In [None]:
def predict_disaster(tweet, model, tokenizer, max_len=100):
    """Classify a single tweet as 'Disaster' or 'Not Disaster'.

    Args:
        tweet: Raw tweet text as a ``str``.
        model: Trained model whose ``predict`` returns one probability per
            input row.
        tokenizer: Fitted tokenizer used to turn the text into word ids.
        max_len: Length every id sequence is padded/truncated to; should match
            the length used when the model was trained.

    Returns:
        'Disaster' if the predicted probability is above 0.5, otherwise
        'Not Disaster'.
    """
    # Apply the same cleaning the training text went through, so URLs and
    # mentions are handled identically at training and inference time.
    cleaned_tweet = clean_text(tweet)
    tweet_seq = tokenizer.texts_to_sequences([cleaned_tweet])
    padded_tweet = pad_sequences(tweet_seq, maxlen=max_len, padding='post')
    prediction = model.predict(padded_tweet)
    # One tweet in means one row out; take the scalar explicitly instead of
    # relying on the truthiness of a one-element array comparison.
    probability = float(prediction[0][0])
    return 'Disaster' if probability > 0.5 else 'Not Disaster'

# Example usage: classify one sample tweet and print the predicted label
example_tweet = "Had a wonderful weekend with family and friends!"
result = predict_disaster(example_tweet, model=model, tokenizer=tokenizer)
print(f'Prediction: {result}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Prediction: Not Disaster
