In [1]:
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use mps:0


In [3]:
text = ["try again, lol"]
labels = [
  "grateful", 
  "political", 
  "frustration", 
  "solution focused", 
  "informative", 
  "news focused", 
  "fearful or panicked",
  "blaming", 
  "seeking help or advice", 
  "prescribed burn", 
  "wildfire", 
  "fire management"
]

results = classifier(text,labels,multi_label=True)
print(results)

[{'sequence': 'try again, lol', 'labels': ['frustration', 'seeking help or advice', 'blaming', 'fearful or panicked', 'political', 'fire management', 'informative', 'solution focused', 'wildfire', 'news focused', 'prescribed burn', 'grateful'], 'scores': [0.9054862856864929, 0.671358048915863, 0.5543333292007446, 0.4060872495174408, 0.23276542127132416, 0.1671888828277588, 0.1189441904425621, 0.03757259249687195, 0.02754698134958744, 0.025259803980588913, 0.014509734697639942, 0.008845468051731586]}]


In [5]:
data_csv = pd.read_csv("word_cloud_3_regions.csv")
text_data = list(data_csv['text'])
text_data

['Los Angeles Wildfires - The Solution:',
 '“I’m signing a declaration that water will be the wettest it’s ever been. And fire will be less hot in California.”',
 'He should sign an executive order banning natural disasters nationwide.',
 'ah yes. The north to south flow of water wile e coyote blocked with the giant acme valve.',
 'I long for the good old days when we naively thought Bush was embarrassing.',
 "This is on us. One of us should have remembered to just turn the water on! Thank god he reminded us. Problem solved, good job everyone, i'll write a note so we don't forget again.",
 'It’s that easy, huh?',
 '“I’m signing an executive order to not do anything :)”',
 'This is the onion right?',
 'This is truly the friggen dumbest MF God ever created.',
 'Where is he today?  I have time to go boo him.',
 'Can I still shower and so forth when they open the valves? Or am I going to lose pressure? \n\nAsking for a friend',
 'Just turn on the faucet. Mind blown.',
 'When the water does

In [6]:
import re 

def clean_text(full_text):
    clean_full_text = []

    for text in full_text:
        lower_case = re.sub(r'(.*)', lambda m: m.group(1).lower(), text)
        clean_text = re.sub(r'http\S+', '', lower_case)
        clean_text_2 = re.sub(r'www\S+', '', clean_text)
        clean_text_3 = re.sub(r'deleted', '', clean_text_2)
        clean_text_4 = re.sub(r'removed', '', clean_text_3)
        clean_full_text.append(clean_text_4)

    return clean_full_text 

In [9]:
clean_reddit_text = clean_text(text_data)

In [15]:
clean_reddit_text = [text for text in clean_reddit_text if text.strip()]
len(clean_reddit_text[0:10])

10

In [22]:
results_batch_1 = classifier(clean_reddit_text[0:1000], candidate_labels=labels, multi_label=True)

In [23]:
results_batch_2 = classifier(clean_reddit_text[1001:2000], candidate_labels=labels, multi_label=True)

In [24]:
results_batch_3 = classifier(clean_reddit_text[2001:3000], candidate_labels=labels, multi_label=True)

In [53]:
# throw results in a csv so that I dont have to run the model again. 
# takes in a list of batches

def create_reddit_dict(batches):
    text_dict = {}
    
    for batch in batches:
        for row in batch:
            text = row['sequence']

            count = 0
            label_set = []
            for score in row['scores']:
                
                if score >= 0.9:
                    label_set.append(row['labels'][count])
                else:
                    break

                count += 1
                
            text_dict[text] = label_set

    return text_dict

In [54]:
batches = [results_batch_1, results_batch_2, results_batch_3]

In [64]:
labeled_text = create_reddit_dict(batches)

In [84]:
# throw dictionary into csv

def convert_to_csv(text_dict):
    pd_dataframe = pd.DataFrame(list(text_dict.items()), columns=["text", "labels"])
    return pd_dataframe

In [86]:
pandas_data_text_labels = convert_to_csv(labeled_text)

In [87]:
pandas_data_text_labels.to_csv("BERT_zeroshot_labels.csv", index=False)

In [93]:
seattle_dataframe = pd.read_csv("seattle_fire_text.csv")
spokane_dataframe = pd.read_csv("spokane_fire_text.csv")

seattle_text = list(seattle_dataframe['text'])
spokane_text = list(spokane_dataframe['text'])

clean_seattle_text = clean_text(seattle_text)
clean_spokane_text = clean_text(spokane_text)

clean_seattle_text = [text for text in clean_seattle_text if text.strip()]
clean_spokane_text = [text for text in clean_spokane_text if text.strip()]

print("seattle length: ", len(clean_seattle_text))
print("spokane length: ", len(clean_spokane_text))

seattle length:  3175
spokane length:  1250


In [94]:
results_batch_spokane = classifier(clean_spokane_text[0:1000], candidate_labels=labels, multi_label=True)

In [95]:
results_batch_seattle_1 = classifier(clean_seattle_text[0:1000], candidate_labels=labels, multi_label=True)

In [96]:
results_batch_seattle_2 = classifier(clean_seattle_text[1001:2000], candidate_labels=labels, multi_label=True)

In [97]:
results_batch_seattle_3 = classifier(clean_seattle_text[2001:3000], candidate_labels=labels, multi_label=True)

In [98]:
batches_2 = [results_batch_spokane, results_batch_seattle_1, results_batch_seattle_2, results_batch_seattle_3]

In [99]:
labeled_text_2 = create_reddit_dict(batches_2)

In [100]:
pandas_data_text_labels_2 = convert_to_csv(labeled_text_2)

In [101]:
pandas_data_text_labels_2.to_csv("BERT_zeroshot_spokane_seattle_labels.csv", index=False)