# 1. INTRODUCTION

# 2. IMPORTS

In [1]:
# Install (or upgrade, -U) fastbook with pip output silenced (-qq), then run its notebook setup.
# NOTE(review): the install is not version-pinned, so a fresh run may resolve different
# package versions than the ones this notebook was last executed with.
# NOTE(review): setup_book() presumably mounts Google Drive on Colab (DATA_PATH below
# points into 'gdrive/MyDrive') — confirm.
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 719 kB 5.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 48.7 MB/s 
[K     |████████████████████████████████| 4.4 MB 47.1 MB/s 
[K     |████████████████████████████████| 362 kB 49.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 33.8 MB/s 
[K     |████████████████████████████████| 101 kB 10.7 MB/s 
[K     |████████████████████████████████| 140 kB 58.0 MB/s 
[K     |████████████████████████████████| 212 kB 49.7 MB/s 
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
[K     |████████████████████████████████| 127 kB 47.7 MB/s 
[K     |████████████████████████████████| 144 kB 47.5 MB/s 
[K     |████████████████████████████████| 94 kB 2.7 MB/s 
[K     |████████████████████████████████| 271 kB 61.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 42.7 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the f

In [2]:
#hide
from fastbook import *

In [3]:
# Pin spaCy to 2.2.4, then import the libraries used by the functions below.
# NOTE(review): the reason for this exact pin is not recorded in the notebook —
# presumably it has to match the version the pickled classifiers were exported with; confirm.
!pip install spacy==2.2.4
from fastai.text.all import *
import pandas as pd
import re
import spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.2.4
  Downloading spacy-2.2.4-cp37-cp37m-manylinux1_x86_64.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 4.8 MB/s 
Collecting thinc==7.4.0
  Downloading thinc-7.4.0-cp37-cp37m-manylinux1_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 37.3 MB/s 
[?25hCollecting plac<1.2.0,>=0.9.6
  Downloading plac-1.1.3-py2.py3-none-any.whl (20 kB)
Collecting srsly<1.1.0,>=1.0.2
  Downloading srsly-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl (184 kB)
[K     |████████████████████████████████| 184 kB 49.7 MB/s 
[?25hCollecting catalogue<1.1.0,>=0.0.7
  Downloading catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting blis<0.5.0,>=0.4.0
  Downloading blis-0.4.1-cp37-cp37m-manylinux1_x86_64.whl (3.7 MB)
[K     |████████████████████████████████| 3.7 MB 29.1 MB/s 
Installing collected packages: srsly, plac, catalogue, blis, thinc, spacy
  Attemp

# 3. CONSTS

In [4]:
# Base folder for everything this notebook reads and writes: <cwd>/gdrive/MyDrive.
# The classifiers are loaded from DATA_PATH/'Classifier' and the tweets from a sub-folder of it.
DATA_PATH = Path.cwd().joinpath('gdrive', 'MyDrive')

# 4. FUNCTIONS

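1. `clean_tweet` -> cleans a tweet (removes mentions, hyperlinks and punctuation noise)
2. `load_classifier` -> loads all four trained classifiers
3. `predict_religions` -> predicts the religions mentioned in a tweet (classifier plus keyword rules)
4. `predict_tweets` -> main function to predict (unknown) tweets in a dataframe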

## 4.1 Clean Tweets

In [5]:
def clean_tweet(tweet):
  '''
  function to clean a tweet before it is passed to the classifiers

  Mentions (@user), http(s) links and a fixed set of punctuation characters
  (full stop, comma, colon, semicolon, straight and curly quotes, hyphen,
  question mark, hash, ampersand, newline, tab) are replaced by a space.
  Runs of whitespace are then collapsed and the result is stripped and lower-cased.

  NOTE(review): earlier versions of this pattern also removed digits and the
  characters ( ) * + / < = > by accident. If the classifiers were trained on
  text cleaned that way, re-validate their predictions with this version.

  arguments:
  ----------
  tweet: (str) the raw tweet text

  return:
  -------
  (str) the cleaned, lower-cased tweet
  '''
  # The hyphen is escaped on purpose: an unescaped '-? inside the character class
  # is a RANGE (0x27..0x3F) that silently swallows 0-9 and ()*+/<=> as well.
  noise = re.compile(r'@\w+|http[s]*\://[\w\./]+|[\.,:;\n\t\"\'\-\?“”#&]+')
  cleaned = noise.sub(' ', tweet)
  # removing several separate pieces can leave runs of blanks behind
  cleaned = re.sub(r'\s{2,}', ' ', cleaned)
  return cleaned.strip().lower()


## 4.2 Load Classifier

In [6]:
def load_classifier(path):
  '''
  function to load the four classifiers trained in TextAnalysis.ipynb

  arguments:
  ----------
  path: folder that contains the 'Classifier' sub-folder with the exported .pkl files

  return:
  -------
  tuple with four learners, in this order: ron, religions, social_spheres, sentiment
  '''
  # order matters: callers unpack the tuple positionally
  exported = (
    'Classifier/ft_twitter_religion_classifier.pkl',       # ron
    'Classifier/ft_twitter_religions_classifier.pkl',      # religions
    'Classifier/ft_twitter_social_fields_classifier.pkl',  # social_spheres
    'Classifier/ft_twitter_sentiment_classifier.pkl',      # sentiment
  )
  return tuple(load_learner(path/pkl) for pkl in exported)

## 4.3 `religion` Predictor Function
If a certain religion is explicitly mentioned in the tweet, it has to appear in the `religion` column.

In [7]:
def predict_religions(x, model, thresh=0.5):
  '''
  function to predict the religions of a tweet: classifier output plus hard coded keyword rules

  arguments:
  ----------
  x: the (cleaned) tweet to predict
  model: the multi-label model; its third predict() output is compared against thresh
  thresh: threshold above which a label is included, default by fastai is 0.5

  return:
  -------
  (str) predicted religion labels, separated by a single blank
  '''

  # labels whose score exceeds the threshold
  above_thresh = model.predict(x)[2] > thresh
  rels = model.dls.multi_categorize.vocab[above_thresh]

  # (keyword, label) rules, evaluated top to bottom; the order decides the order
  # in which missing labels are appended. Keywords are matched as plain substrings.
  # NOTE(review): 'hindu' is appended as 'hindu' while the other labels name the
  # religion itself — confirm this spelling matches the classifier vocab.
  keyword_rules = (
    ('christian', 'christianity'),
    ('muslim', 'islam'),
    ('islam', 'islam'),
    ('allah', 'islam'),
    ('christ', 'christianity'),
    ('jesus', 'christianity'),
    ('hindu', 'hindu'),
    ('jew', 'judaism'),
    ('judaism', 'judaism'),
    ('atheism', 'atheism'),
    ('buddhism', 'buddhism'),
    ('buddhist', 'buddhism'),
    ('buddha', 'buddhism'),
  )

  # add every religion that is explicitly mentioned but was missed by the model
  for keyword, label in keyword_rules:
    if keyword in x and label not in rels:
      rels.append(label)

  return ' '.join(rels)



## 4.4 Predict Tweets

In [8]:
def predict_tweets(path):
  '''
  function to predict unknown tweets stored in xlsx files in a folder (xlsx files need to have a certain format, see TwitterHarvester.ipynb)

  For every input workbook the columns clean_tweet, ron, sentiment, religion and
  social_spheres are added and the result is written next to the input as
  <name>_predicted.xlsx. Workbooks whose name already ends in _predicted are
  skipped, so running the function again does not classify its own output.

  The classifiers are loaded from the module-level DATA_PATH, and only if there
  is at least one workbook to predict.

  arguments:
  ----------
  path: (Path) path to folder with xlsx files; every file needs a 'text' column

  return:
  -------
  None (prints "No files found!" if the folder holds no workbook to predict)
  '''
  out_suffix = '_predicted'

  # Collect the inputs once. Results are written into this same folder, so files
  # produced by an earlier run have to be filtered out here; otherwise a re-run
  # would create <name>_predicted_predicted.xlsx and so on.
  files = sorted(f for f in path.glob('*.xlsx') if not f.stem.endswith(out_suffix))

  # no files in folder
  if not files:
    print("No files found!")
    return None

  # LOAD CLASSIFIER (only now that we know there is something to predict)
  ron_clas, rel_clas, soc_clas, sen_clas = load_classifier(DATA_PATH)

  for file in files:
    ### LOAD AND PREPARE DF
    print(file)

    df = pd.read_excel(file, index_col=0)

    # leftover index column from an earlier save without a named index
    if 'Unnamed: 0' in df.columns:
      df = df.drop(columns=['Unnamed: 0'])

    # creating clean version of tweets in separate column
    tweets = df['text'].apply(clean_tweet)
    df['clean_tweet'] = tweets

    ### PREDICT TWEETS
    # Only the cleaned text is needed, so map over that one column instead of whole rows.
    df['ron'] = tweets.apply(lambda t: ron_clas.predict(t)[0])
    df['sentiment'] = tweets.apply(lambda t: sen_clas.predict(t)[0])
    # religions use a stricter threshold (0.8) than the fastai default
    df['religion'] = tweets.apply(lambda t: predict_religions(t, rel_clas, 0.8))
    df['social_spheres'] = tweets.apply(lambda t: ' '.join(soc_clas.predict(t)[0]))

    # SAVE PREDICTED DF
    df.to_excel(path/f'{file.stem}{out_suffix}.xlsx')

# 5. Analysis

In [9]:
# Run the prediction over the xlsx files in the 'to_predict' folder; the results are
# written into that same folder as <name>_predicted.xlsx.
# NOTE(review): the recorded output below shows paths under .../MyDrive/ReligionML/Data/,
# which does not match the DATA_PATH defined above — confirm the constant is current.
predict_tweets(DATA_PATH/'Twitter/Religion/DHQ/to_predict/')

/content/gdrive/MyDrive/ReligionML/Data/Twitter/Religion/DHQ/to_predict/twitter_rel_2022-6-21.xlsx


/content/gdrive/MyDrive/ReligionML/Data/Twitter/Religion/DHQ/to_predict/twitter_rel_2022-6-24.xlsx
