
# Approach 1: Decision Tree

## The idea:
The input is a target/topic and a statement, and the output is for/against/neutral.

Data pre-processing: tokenizer to convert words into tokens
+ topics into tokens 
+ emotion of statement to token

There will be a model for each topic/target that will be put into the function. 

For targets that do not exist in the dataset:
+ The first scenario is to look for closely related topics and predict based on those
+ The second scenario is to generate more data using OpenAI!

## Installs and Imports

In [None]:
!pip install -q spacy
!pip install -q scikit-learn
!pp install -q gensim
!pip install -q openai
!python -m spacy download en_core_web_sm

In [2]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import spacy
import pandas as pd
from scipy import spatial
import gensim.downloader as api
import numpy as np
from gensim.models import Word2Vec
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import pickle
import openai

In [None]:
# Load the small English spaCy pipeline and fetch the NLTK resources
# (stop-word list and punkt tokenizer data) used by the preprocessing code.
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')
nltk.download('punkt')

# Pre-trained word vectors; alternatives are listed at
# https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-twitter-50")

# Words that must NOT be treated as stop words (mostly negations and modals),
# presumably because they change the stance a sentence expresses.
needed_words = [
  'against', 'no', 'nor', 'not', "don't", 'should',
  "aren't", 'couldn', "couldn't", 'didn', "didn't",
  'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
  'haven', "haven't", 'isn', "isn't", 'mustn', "mustn't",
  'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
  'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
  'wouldn', "wouldn't",
]

# Start from NLTK's English stop words and drop every word we want to keep.
updated_stopwords = stopwords.words("english")
for keep_word in needed_words:
  try:
    updated_stopwords.remove(keep_word)
  except ValueError:
    # The word was not in the stop-word list to begin with.
    pass

In [None]:
# Mount Google Drive so that the dataset/model paths under /content/drive
# used throughout this notebook can be read and written.
from google.colab import drive

drive.mount('/content/drive')

## Code Logic

In [5]:
# Read the OpenAI key from the environment instead of hard-coding a secret in
# the notebook. NOTE: the key that used to be written here has been exposed in
# source control and should be treated as leaked and revoked.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY")

def create_new_data(num, sentiment, stance, topic):
  """
  Generate tweets using openAI

  Sends one completion request and extracts the numbered items
  ("1. ...", "2) ...", "10: ...") from the reply.

  :param num: Number of tweets requested. The model does not always honour
              it, so the returned list may hold fewer or more items
  :param sentiment: have sentiment as a variable as well while generating tweets
  :param stance: for or against the topic (phrase placed before the topic in the prompt)
  :param topic: the topic for which you want to generate stance
  :return: list of tweet texts with the numbering and surrounding whitespace removed
  """
  import re

  response = openai.Completion.create(
    model="text-davinci-002",
    prompt=f"make {num} {sentiment} tweets that are {stance} {topic}",
    temperature=0.7,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )

  raw_text = response['choices'][0]['text']

  # A numbered item: optional indentation, one or more digits, a "." / ")" / ":"
  # delimiter, then the tweet text. Slicing a fixed 3 characters off the line
  # only worked for the exact "N. " form with a single-digit N.
  numbered_item = re.compile(r"^\s*\d+\s*[.):]\s*(.+?)\s*$")

  tweets = []
  for line in raw_text.split("\n"):
    match = numbered_item.match(line)
    if match:
      tweets.append(match.group(1))

  return tweets

# print(create_new_data(5, "sad", "in favor of", "battery"))

In [6]:
def _generate_topic_data(topic, per_request=5):
  """Request tweets about `topic` for every stance/emotion pair and return them as a df."""
  emotions = ["angry", "happy", "sad", "surprise", "sarcastic", "neutral"]
  # (phrase used in the prompt, numeric stance label)
  stance_prompts = [("in favor of", 1), ("against", 0), ("neutral to", 2)]

  query, target, stance = [], [], []
  for phrase, label in stance_prompts:
    for emotion in emotions:
      tweets = create_new_data(per_request, emotion, phrase, topic)
      # The generator does not always return exactly `per_request` tweets, so
      # size the companion columns from what actually came back; otherwise the
      # columns end up with different lengths and the DataFrame cannot be built.
      query.extend(tweets)
      target.extend([topic] * len(tweets))
      stance.extend([label] * len(tweets))

  return pd.DataFrame(data={"Query": query, "Target": target, "Stance": stance})


def _closest_known_target(topic, known_targets):
  """Return (lower-cased target, similarity) of the known target closest to `topic`, or (None, 0)."""
  key = topic.lower()
  # `word in model` works for gensim KeyedVectors both before and after 4.0,
  # unlike the removed `.vocab` attribute. Non-string targets (e.g. NaN) are skipped.
  candidates = sorted({
    target.lower() for target in known_targets
    if isinstance(target, str) and target.lower() in model
  })
  if key not in model or not candidates:
    return None, 0

  try:
    best = model.most_similar_to_given(key, candidates)
    return best, model.similarity(key, best)
  except (KeyError, ValueError):
    return None, 0


def get_data(topic="Solar Power"):
  """
  Gets all the data for a particular input topic and returns a df
  0 is against, 1 is pro, 2 is neither

  Lookup order:
  1. a CSV of tweets generated for this topic on an earlier run;
  2. rows of the main dataset whose Target equals the topic;
  3. rows of the most similar known target (word-vector similarity >= 0.7);
  4. freshly generated tweets, which are also saved for later runs.

  :param topic: the topic/target to fetch data for
  :return: DataFrame of rows for the topic (generated data has the columns
           Query, Target and Stance)
  """
  generated_path = f'/content/drive/My Drive/University/CSE 573: Stance Detection /Datasets/Output/openai_gen/openai_{topic}.csv'

  try:
    return pd.read_csv(generated_path)
  except (OSError, ValueError):
    # No readable generated file for this topic (missing, empty or malformed);
    # fall through to the main dataset.
    pass

  data_path = "/content/drive/MyDrive/University/CSE 573: Stance Detection /Datasets/Final/data.csv"
  # data_path = "/content/drive/MyDrive/University/CSE 573: Stance Detection /Datasets/Final/combined_original.csv"
  df = pd.read_csv(data_path)

  # Check if topic exists or not in our current data
  topic_df = df.loc[df['Target'].isin([topic, topic.lower()])]
  if not topic_df.empty:
    return topic_df

  # check if anything close to the topic exists in our current data
  most_similar, similarity_score = _closest_known_target(topic, df['Target'].unique())

  if most_similar is None or similarity_score < 0.7:
    # Nothing close enough: generate a new dataset and keep it for next time.
    topic_df = _generate_topic_data(topic)
    topic_df.to_csv(generated_path, index=False)
  else:
    # `most_similar` is lower-cased, so compare case-insensitively; matching it
    # against the original-case names would miss targets such as "Atheism".
    topic_df = df.loc[df['Target'].astype(str).str.lower() == most_similar]

  return topic_df

# print(get_data("Battery"))

In [7]:
def get_sentence_vector(row):
  """
  Get the vectorised form of the sentence
  Can add tfidf*word vect for more precision

  :param row: the sentence itself (str), or a row-like object (pandas Series,
              list, tuple) whose first element is the sentence
  :return: 1D np array, the mean of the word vectors of all in-vocabulary
           words; a zero vector when no word of the sentence is in the vocabulary
  """

  def preprocess(s):
    """
    Preprocessing for sentence, converting to lower case and removing stop words
    """
    # Lower-case BEFORE the stop-word check: the stop-word list is lower case,
    # so capitalised stop words ("The", "And") would otherwise be kept.
    lowered = (token.lower() for token in word_tokenize(s))
    return [token for token in lowered if token not in updated_stopwords]

  # Indexing a plain string with [0] would yield only its first character, so
  # strings are used as-is; pandas rows are read positionally via .iloc.
  if isinstance(row, str):
    sentence = row
  elif hasattr(row, "iloc"):
    sentence = row.iloc[0]
  else:
    sentence = row[0]

  # `word in model` is supported by gensim KeyedVectors before and after 4.0
  # (the `.vocab` attribute was removed in 4.0).
  vect = [model[word] for word in preprocess(sentence) if word in model]

  # vect is a list of vectors, one per word. Without any known word the mean
  # would be NaN, so fall back to a zero vector of the right size.
  if not vect:
    return np.zeros(model.vector_size)

  # final_vect is a 1D np array, the vector of the sentence
  final_vect = np.mean(np.array(vect), axis=0)

  return final_vect

In [121]:
# Accumulators for the evaluation summary: each training run appends one entry
# to every list, so index i across all lists describes the same run.
out_df_f1 = []
out_df_acc = []
out_df_topic = []
out_df_dataset = []
out_df_model = []
out_df_pro_count = []
our_df_neutral_count = []
out_df_against_count = []

In [8]:
def generate_cluster(topic="Solar Power", model_type="decision tree"):
  """
  Generates a cluster for given topic

  Fetches the topic's data, vectorises every sentence, trains the requested
  model on an 80/20 split, pickles it to Drive and appends the evaluation
  figures to the module-level out_df_* lists.

  :param topic: topic/target to train a model for
  :param model_type: "decision tree" or "kmeans"
  :return: the fitted model
  :raises ValueError: if model_type is not one of the supported values
  """
  if model_type not in ("kmeans", "decision tree"):
    raise ValueError(f"Unsupported model_type: {model_type!r}")

  topic_df = get_data(topic)

  vectorized_data = [get_sentence_vector(row) for _, row in topic_df.iterrows()]

  print(topic_df)

  X = vectorized_data
  y = topic_df["Stance"]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  if model_type=="kmeans":
    # NOTE(review): KMeans cluster ids are arbitrary, so the F1/accuracy
    # computed below against the stance labels is not meaningful for this model.
    train_model = KMeans(n_clusters=3, random_state=42).fit(X=X_train)
    y_pred = train_model.predict(X_test)

    print("Most representative terms per cluster (based on centroids):")
    for i in range(3):
        # Query the word vectors directly; KeyedVectors has no `.wv` in gensim 4.
        most_representative = model.most_similar(positive=[train_model.cluster_centers_[i]], topn=10)
        tokens_per_cluster = "".join(f"{term} " for term, _ in most_representative)
        print(f"Cluster {i}: {tokens_per_cluster}")

  else:
    train_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
    train_model.fit(X_train, y_train)
    y_pred = train_model.predict(X_test)

  # save the model to disk
  filename = f'/content/drive/My Drive/University/CSE 573: Stance Detection /Datasets/Output/dt_models/backtranslated_{model_type}_{topic}_model.pkl'
  with open(filename, 'wb') as model_file:
    pickle.dump(train_model, model_file)

  # Compute everything first and append afterwards, so a failing metric cannot
  # leave the result lists with different lengths.
  f1_percent = f1_score(y_test, y_pred, average='macro')*100
  accuracy = accuracy_score(y_test, y_pred)
  stance_counts = topic_df["Stance"].value_counts()

  out_df_topic.append(topic)
  out_df_f1.append(f1_percent)
  out_df_acc.append(accuracy)
  out_df_dataset.append("Backtranslated")
  out_df_model.append(model_type)
  # A stance that never occurs in the data is reported as 0.
  out_df_against_count.append(stance_counts.get(0, 0))
  out_df_pro_count.append(stance_counts.get(1, 0))
  our_df_neutral_count.append(stance_counts.get(2, 0))

  # print("-----------------------------------------")

  # print(confusion_matrix(y_test, y_pred))
  # print("Accuracy for {}: {:.2f}%".format(topic, accuracy_score(y_test, y_pred)))
  # print("F-1 score for {}: {:.2f}%".format(topic, f1_score(y_test, y_pred, average='macro')*100))
  # print(classification_report(y_test, y_pred, digits=3))

  # print("-----------------------------------------")

  return train_model

# topic = "Religion"
# generate_cluster(topic)

In [11]:
def predict(topic, sentence):
  """
  enter your topic and sentence and get output of what the model predicts!

  Loads the pickled decision tree for the topic, training (and saving) a new
  one when no usable file exists.

  :param topic: topic/target the sentence is about
  :param sentence: the sentence as a string; a list/tuple whose first element
                   is the sentence is accepted as well
  :return: array with a single predicted stance label (0 against, 1 pro, 2 neither)
  """

  model_type = "decision tree"
  filename = f'/content/drive/My Drive/University/CSE 573: Stance Detection /Datasets/Output/dt_models/backtranslated_{model_type}_{topic}_model.pkl'

  try:
    # NOTE: unpickling executes code from the file; this is only acceptable
    # because these files are written by generate_cluster in this notebook.
    with open(filename, 'rb') as model_file:
      trained_model = pickle.load(model_file)
  except (OSError, pickle.UnpicklingError, EOFError, AttributeError, ImportError):
    # Missing, truncated or incompatible model file: train a fresh model.
    trained_model = generate_cluster(topic=topic, model_type=model_type)

  if not isinstance(sentence, str):
    sentence = sentence[0]
  sentence = sentence.strip()

  # Wrap the sentence in a list so that the vectoriser's "first element" is the
  # whole sentence; passing the bare string would vectorise only its first
  # character.
  prediction = trained_model.predict([get_sentence_vector([sentence])])

  return prediction

# predict(topic="Atheism", sentence=["God is not real"])

In [13]:
def predict_stance(topic, sentence):
  """
  Predict the stance of `sentence` towards `topic` as a readable label.

  :return: "Against" for label 0, "Support" for label 1, "Neutral" otherwise
  """
  label = predict(topic, sentence)[0]

  if label == 0:
    return "Against"
  return "Support" if label == 1 else "Neutral"

In [None]:
# Run the per-topic models over the held-out generated dataset and store the
# predictions next to the inputs.
data_path = "/content/drive/MyDrive/University/CSE 573: Stance Detection /Datasets/Testing/FIXED_openai_generated_dataset_FIXED.csv"
real_df = pd.read_csv(data_path)

# Column 0 is passed as the sentence and column 1 as the topic. Columns are
# read positionally with .iloc because integer indexing on a row Series is
# deprecated in recent pandas.
out = []
for sentence, topic in zip(real_df.iloc[:, 0], real_df.iloc[:, 1]):
  try:
    out.append(predict(topic, sentence)[0])
  except Exception:
    # Best effort: a row that cannot be predicted is marked with -1.
    # (Exception, not a bare except, so the loop can still be interrupted.)
    out.append(-1)

new_df = real_df
new_df["out"] = out
new_df.to_csv('/content/drive/My Drive/University/CSE 573: Stance Detection /Datasets/Output/decision_tree_out_real.csv', index=False)
# for topic in set(df["Target"]):
#   # print(topic)
#   predict(topic=topic, sentence=["God is not real"])

In [132]:
# Turn the per-run accumulators filled by generate_cluster into one table
# (one row per trained model) and write it to Drive.
result_columns = [
    "Target", "Model", "Dataset", "F1", "Accuracy",
    "Count of Pro", "Count of Neutral", "Count of Against",
]
result_values = [
    out_df_topic, out_df_model, out_df_dataset, out_df_f1, out_df_acc,
    out_df_pro_count, our_df_neutral_count, out_df_against_count,
]
out_data = dict(zip(result_columns, result_values))

out_df = pd.DataFrame(out_data)
results_path = '/content/drive/My Drive/University/CSE 573: Stance Detection /Datasets/Output/decision_tree_out.csv'
out_df.to_csv(results_path, index=False)

In [18]:
# Quick end-to-end check: classify one sample sentence for the "Religion" topic.
predict_stance(topic="Religion", sentence="God is real")

'Support'

## References that are priceless
1. https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence/
2. https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/