In [6]:
# Sentiment analysis and output of sentiment analysis to file

# Import modules
import json, re, sys, os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
# Add custom directory to path
sys.path.append(os.path.abspath("../py/"))
# Import script from directory
import twit_data_scraper

def process_text_textblob(text):
    '''
    Clean tweet text for sentiment analysis with TextBlob

    Removes, in this order: http(s) URLs, @mentions, '#' characters (the
    hashtag word itself is kept) and retweet markers ("RT" followed by
    whitespace). Whitespace around removed pieces is left untouched.

    Parameters
        text - raw tweet text (str)
    Returns - the cleaned text (str)
    Dependencies - None
    '''
    print('Cleaning text (for TextBlob)')
    text = re.sub(r'https?://\S+', '', text)
    # The underscore is part of the handle pattern so that "@some_user" is
    # removed completely instead of leaving "_user" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b restricts the match to a standalone "RT" token; without it the tail
    # of any word ending in "RT" (e.g. "START now") would be stripped too.
    text = re.sub(r'\bRT\s+', '', text)
    print('Text cleaned')
    return text

def roberta_load():
    '''
    Fetch the pretrained Roberta sentiment classifier and its tokenizer
    Returns - list holding the model first and the tokenizer second
    Dependencies - None
    '''
    checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
    print("Loading Roberta and autokenizer.")
    # Both objects are created from the same checkpoint name
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    tok = AutoTokenizer.from_pretrained(checkpoint)
    print("Roberta and autotokenizer loaded.")
    return [classifier, tok]

def roberta_sent_score(df, col_name, model, tokenizer, data_path):
    '''
    Calculate sentiment scores and append them to a text file

    For every row of df the text in df[col_name] is normalised, passed
    through tokenizer and model, and the softmax of the first output row is
    appended to data_path as one JSON object per line. Each object holds the
    row's tweet_id plus negative_score, neutral_score and positive_score,
    all stored as strings.

    Parameters
        df - dataframe with a 'tweet_id' column and the col_name column
        col_name - name of the column holding the tweet text
        model - callable invoked as model(**encoded_tweet)
        tokenizer - callable invoked as tokenizer(text, return_tensors='pt')
        data_path - path of the file the JSON lines are appended to
    Returns - 0
    Dependencies - roberta_load (called from main)
    '''
    print("Calculating and recording tweet sentiment score.")
    labels = ('negative_score', 'neutral_score', 'positive_score')

    def clean_text(tweet):
        '''
        Replace every @mention with '@user' and every word starting with
        'http' with 'http'
        Dependencies - None
        '''
        tweet_words = []
        for word in tweet.split(' '):
            if word.startswith('@') and len(word) > 1:
                word = '@user'
            elif word.startswith('http'):
                word = "http"
            tweet_words.append(word)
        return " ".join(tweet_words)

    # The file is opened once for the whole run instead of once per tweet.
    with open(data_path, 'a') as fhandle:
        # Iterating over the column values is positional, so the dataframe
        # index does not need to be the default 0..n-1 range.
        for tweet_id, tweet in zip(df['tweet_id'], df[col_name]):
            encoded_tweet = tokenizer(clean_text(tweet), return_tensors='pt')
            output = model(**encoded_tweet)
            score = softmax(output[0][0].detach().numpy())
            score_line = {'tweet_id': str(tweet_id)}
            score_line.update(
                (label, str(value)) for label, value in zip(labels, score)
            )
            print(score_line)
            json.dump(score_line, fhandle)
            fhandle.write("\n")
            # Flush per record so finished scores survive a crash mid-run.
            fhandle.flush()
    print("Tweet sentiment score calculated and recorded.")
    return 0

def senti_comparison(x):
    '''
    Compare values of negative, neutral and positive emotions and return the dominant emotion

    Parameters
        x - mapping or dataframe row with the keys 'negative_score',
            'neutral_score' and 'positive_score'; each value may be a number
            or a numeric string
    Returns - 'Negative', 'Positive' or 'Neutral' for the strictly largest
              score, 'Undecided' when no score is strictly larger than both
              others (ties, NaN values)
    Raises - ValueError / TypeError if a score cannot be converted to float
    Dependencies - None
    '''
    # Scores are coerced to float first: string operands would be compared
    # character by character, which ranks e.g. '9e-05' above '0.9'.
    negative = float(x['negative_score'])
    neutral = float(x['neutral_score'])
    positive = float(x['positive_score'])
    if negative > neutral and negative > positive:
        return 'Negative'
    if positive > neutral and positive > negative:
        return 'Positive'
    if neutral > negative and neutral > positive:
        return 'Neutral'
    return 'Undecided'

def df_types(df):
    '''
    Placeholder for explicitly describing the data types in a dataframe
    (needed for automatic data summary scripts)
    Not implemented yet - the argument is ignored and None is returned
    Dependencies - None
    '''
    return None
    
def file_path(rel_path, data_file):
    '''
    Build the absolute path of data_file inside the directory rel_path
    Returns - the joined path as a string
    Dependencies - None
    '''
    print("Generating data file path.")
    # Resolve the directory to an absolute path, then append the file name
    full_path = os.path.join(os.path.abspath(rel_path), data_file)
    print('File path generation completed.')
    return full_path
    

def main():
    '''
    Score the tweets in data/test.jsonuk and store them together with their scores

    Steps: read the input tweets, append one sentiment score line per tweet
    to data/test_sentiment.jsonuk, left-merge the scores onto the tweets by
    tweet_id, label every row with its dominant sentiment ('net_score'),
    write the result to data/test_with_sentiments.json, then read that file
    back and print it.

    Returns - 0
    Raises - ValueError if the input file extension is neither .jsonuk nor .csv
    Dependencies - file_path, roberta_load, roberta_sent_score,
                   senti_comparison, twit_data_scraper
    '''
    inpt_fname = 'test.jsonuk'
    otpt_fname = 'test_sentiment.jsonuk'
    merged_fname = 'test_with_sentiments.json'
    # Validate the format before the model is loaded; an unknown extension
    # would otherwise leave df undefined further down.
    extension = os.path.splitext(inpt_fname)[1].lower()
    if extension not in ('.jsonuk', '.csv'):
        raise ValueError("Unsupported input file format: " + inpt_fname)
    data_path = file_path("data/", inpt_fname)
    print(data_path)
    # Initialize model and tokenizer
    model, tokenizer = roberta_load()
    # Read data into pandas dataframe based on file format
    if extension == '.jsonuk':
        df = twit_data_scraper.tweetfile_2_dataframe(data_path)
    else:
        df = pd.read_csv(data_path)
    data_path = file_path("data/", otpt_fname)
    roberta_sent_score(df, 'tweet_text', model, tokenizer, data_path=data_path)
    print(data_path)
    df2 = twit_data_scraper.tweetfile_2_dataframe(data_path)
    # Use the same key dtype on both sides of the merge
    df['tweet_id'] = df['tweet_id'].astype(str)
    df2['tweet_id'] = df2['tweet_id'].astype(str)
    # roberta_sent_score appends to the scores file, so a re-run leaves
    # several lines per tweet; keep only the most recent one so the merge
    # does not multiply rows.
    df2 = df2.drop_duplicates(subset='tweet_id', keep='last')
    # Merge data frames
    df3 = pd.merge(df, df2, how='left', on='tweet_id')
    # Determine overall sentiment and add it to data frame
    df3['net_score'] = df3.apply(senti_comparison, axis=1)
    # Write merged data frame to file and read it back for display
    merged_path = file_path("data/", merged_fname)
    df3.to_json(merged_path)
    df4 = pd.read_json(merged_path)
    print('----------------------------------------')
    print(df4)
    return 0

# Run the pipeline only when this code is executed directly, not when it is imported
if __name__ == "__main__":
    main()


Generating data file path.
File path generation completed.
/home/nuclear/Stuff/PythonPrograms/Twitter-Portfolio-Project/data/test.jsonuk
Loading Roberta and autokenizer.
Roberta and autotokenizer loaded.
Converting file to dataframe.
File to dataframe conversion completed.
Generating data file path.
File path generation completed.
Calculating and recording tweet sentiment score.
{'tweet_id': '1567151913602269184', 'negative_score': '0.022226034', 'neutral_score': '0.33841527', 'positive_score': '0.6393587'}
{'tweet_id': '1567151655560388608', 'negative_score': '0.4399557', 'neutral_score': '0.52734315', 'positive_score': '0.03270108'}
{'tweet_id': '1567151271232110594', 'negative_score': '0.11765074', 'neutral_score': '0.7982512', 'positive_score': '0.08409795'}
{'tweet_id': '1567150906902208513', 'negative_score': '0.39192396', 'neutral_score': '0.58538103', 'positive_score': '0.022694923'}
{'tweet_id': '1567150877508501505', 'negative_score': '0.89643264', 'neutral_score': '0.0989284