In [182]:
            
# General
import pandas as pd
import numpy as np

# For natural language processing
import regex as re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

# For logistic regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# For K-Means
from sklearn.cluster import KMeans, k_means
from sklearn.metrics import silhouette_score
from sklearn.datasets.samples_generator import make_blobs

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline

In [183]:
# making data frame from csv file  
df = pd.read_csv("all_disaster_tweets.csv") 

In [184]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_text,disaster_type
0,0,hoy es nuestro dia nerea andres diego g...,99.0
1,1,yeahh yeahh esa jenga tocayo pablo,99.0
2,2,a m e do meu amigo pablo morreu ( ta muito ...,99.0
3,3,madrinas y apadrinados santacecilia2012 pa...,99.0
4,4,11 26 12 e games attendant chada bingo hou...,99.0
5,5,el cumplea os de pablo escobar my dad ...,99.0
6,6,"st scholastica's college, manila suspends clas...",1.0
7,7,rt dynamic designed qrcode for cdo expan...,99.0
8,8,en el curso de primeros auxilios con alvar...,99.0
9,9,que grandes pablo,99.0


In [185]:
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)


In [186]:
df.head()

Unnamed: 0,tweet_text,disaster_type
0,hoy es nuestro dia nerea andres diego g...,99.0
1,yeahh yeahh esa jenga tocayo pablo,99.0
2,a m e do meu amigo pablo morreu ( ta muito ...,99.0
3,madrinas y apadrinados santacecilia2012 pa...,99.0
4,11 26 12 e games attendant chada bingo hou...,99.0


In [187]:
# TFID Vectorizer
def preprocess(s):
  lemmatizer = nltk.WordNetLemmatizer()
  return lemmatizer.lemmatize(s)

stop = set(stopwords.words('english'))


"""
# Stopwords part

def not_stopword(s):
  s = s.strip()
  v = stopwords.words('english')
  result = ""
  words = nltk.word_tokenize(s)
  for word in words:
    if word not in v:
      result += word + " "
  return result.strip()

i=0
for token in tokens:
  token = preprocess(token)
  
finalsentence = ' '.join(tweet.split())


print(finalsentence)
"""


vectorizer = TfidfVectorizer(stop_words=stop, analyzer='word', max_features=20000, dtype=np.float32, preprocessor=preprocess)

data = vectorizer.fit_transform(tweets).toarray()
testData = vectorizer.transform(testTweets).toarray()
print(type(data), data)

  'stop_words.' % sorted(inconsistent))


<class 'numpy.ndarray'> [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [188]:
df.isna().sum()

tweet_text        2
disaster_type    10
dtype: int64

In [189]:
df=df.dropna()

In [190]:
df.isna().sum()

tweet_text       0
disaster_type    0
dtype: int64

In [191]:
# Set features and target
X = df['tweet_text']
y = df['disaster_type']

In [192]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)


In [193]:
#Bag of Words
vect = CountVectorizer()

# Fit the vectorizer on our corpus and transform
X_train_vect = vect.fit_transform(X_train)
X_train_vect = pd.DataFrame(X_train_vect.toarray(), columns=vect.get_feature_names())

# Transform the test set
X_test_vect = vect.transform(X_test)

# Transform entire set for K-Means clustering later
X_vect = vect.transform(X)

In [194]:
#Logistic Regression
#Use logistic regression to get words that are most likely to be from tweets made during power outages. 
#Cross-validation and accuracy scores help serve as a sanity check.

# Get baseline accuracy score
y_train.value_counts(normalize=True)[1]

0.06611609700210044

In [195]:
# Instantiate model
logreg = LogisticRegression(solver='liblinear')

# Fit on training data.
logreg.fit(X_train_vect, y_train)

# Get scores
print('CV score:', cross_val_score(logreg, X_train_vect, y_train, cv=3).mean())
print('Training accuracy:', logreg.score(X_train_vect, y_train))
print('Testing accuracy:', logreg.score(X_test_vect, y_test))




CV score: 0.9232382446588868
Training accuracy: 0.9767042199732672
Testing accuracy: 0.9238149792352857


In [196]:
# Create confusion matrix
predictions = logreg.predict(X_test_vect)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, columns=['predict neg', 'predict pos'], index=['actual neg', 'actual pos'])
cm_df


ValueError: Shape of passed values is (7, 7), indices imply (2, 2)

In [197]:
#Assosiated words
# Create dataframe with coefs and e^coefs for each word
coefs = list(zip(vect.get_feature_names(), logreg.coef_[0].T))
coefs = pd.DataFrame(coefs, columns = ['word','coef'])
coefs['e^coef'] = np.exp(coefs['coef'])


In [198]:
# Show words most associated with tweets 
coefs.sort_values(by='e^coef', ascending=False).head(20)

Unnamed: 0,word,coef,e^coef
12463,highparkfire,5.399164,221.221326
17874,nswfires,5.06134,157.801773
22623,santamaria,4.847461,127.416454
10580,fires,4.003775,54.804671
27969,wildfires,3.653322,38.602695
27968,wildfire,3.556587,35.043385
10559,fire,3.324158,27.775593
17870,nswbushfires,2.941881,18.951466
17593,nightclub,2.728926,15.316428
17869,nsw,2.724221,15.244541


In [199]:
#Explore tweets with specific words
# Function to get full tweets and count of tweets containing specific words
def get_tweets(word):
    mask = df[df['tweet_text'].str.contains(f'(^|\W){word}($|\W)')].index
    count = 0
    for i in mask:
        count += 1
        print(i, df['tweet_text'][i], '\n')
    print(f'\nTotal tweets containing "{word}": {count}')

In [None]:
get_tweets('typhoon')


In [None]:
#K-Means Clustering
kmeans = KMeans(n_clusters=3)
model = kmeans.fit(X_vect)


In [200]:
# Attach predicted cluster to dataframe
df['predictions'] = model.labels_
df.head(3)

Unnamed: 0,tweet_text,disaster_type,predictions
0,hoy es nuestro dia nerea andres diego g...,99.0,1
1,yeahh yeahh esa jenga tocayo pablo,99.0,1
2,a m e do meu amigo pablo morreu ( ta muito ...,99.0,1


In [201]:
df.loc[df['predictions_label'] == 0, 'tweet_text']

KeyError: 'predictions_label'

In [169]:
df.loc[df['predictions_label'] == 1, 'tweet_text']


0        hoy es nuestro dia    nerea  andres   diego  g...
1                  yeahh yeahh esa jenga    tocayo  pablo 
2        a m e do meu amigo  pablo  morreu  ( ta muito ...
3        madrinas y apadrinados  santacecilia2012    pa...
4        11 26 12   e games attendant   chada bingo hou...
                               ...                        
27937    coalminers escape heavy flood damage  queensla...
27938    donations to queensland flood appeal fall shor...
27939    emergency payments to flood victims in qld wil...
27940     qldpol  auspol  qld's flood appeal committee ...
27941    new post  after heatwaves and flooding, januar...
Name: tweet_text, Length: 18308, dtype: object

In [170]:
## Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_)
centroids.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28438,28439,28440,28441,28442,28443,28444,28445,28446,28447
0,0.001561,0.013337,0.0,0.0,0.0,0.000142,0.0,0.0,0.0,0.000851,...,0.0,0.0,0.0,0.0,0.000142,0.0,0.0,0.0,0.0,0.0
1,0.003878,0.012672,0.000109,5.5e-05,0.000273,0.0,5.5e-05,0.000109,5.5e-05,0.000492,...,0.0,0.0,5.5e-05,5.5e-05,0.0,0.000109,0.000109,0.000109,5.5e-05,5.5e-05
2,0.006214,0.003495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000388,0.001165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
df.head()

Unnamed: 0,tweet_text,disaster_type,predictions
0,hoy es nuestro dia nerea andres diego g...,99.0,1
1,yeahh yeahh esa jenga tocayo pablo,99.0,1
2,a m e do meu amigo pablo morreu ( ta muito ...,99.0,1
3,madrinas y apadrinados santacecilia2012 pa...,99.0,1
4,11 26 12 e games attendant chada bingo hou...,99.0,1


In [174]:
df = df.groupby(['disaster_type' , 'predictions']) 
  

Unnamed: 0,tweet_text,disaster_type,predictions
0,hoy es nuestro dia nerea andres diego g...,99.0,1
1,yeahh yeahh esa jenga tocayo pablo,99.0,1
2,a m e do meu amigo pablo morreu ( ta muito ...,99.0,1
3,madrinas y apadrinados santacecilia2012 pa...,99.0,1
4,11 26 12 e games attendant chada bingo hou...,99.0,1
...,...,...,...
13108,death toll in brazil nightclub fire rises to 1...,0.0,0
13158,what a terrible thing happened in santamaria ...,0.0,0
13195,rt video firefighters and civilians battle t...,0.0,0
24895,rt emergenza olbia hotel de plam ha solo una...,4.0,2
