## Extract Emotion Encodings

In [None]:
# Build the label encoder over the fixed set of emotion labels used by the classifier
label_encodings = sorted(
    ["love", "anger", "disgust", "fear", "happiness", "sadness", "surprise", "neutral", "other"]
)
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.fit(label_encodings)

# Print out the fitted classes to check the encoder has been set up successfully
emotions = label_encoder.classes_
print(emotions)

## Load Classifier and Scaler

Load the logistic regression classifier and the pre-fit scaler, and store both models.

In [None]:
# Set classifier_path to location of the logistic regression classifier
classifier_path = '../../Resources/Models/nli-mpnet-base-v2-LR-classifier.pkl'

# Default to None so that later cells can test whether loading succeeded
# instead of failing on an undefined name.
sentiment_model = None

try:
    # Load and store model in sentiment_model.
    # The context manager closes the file on every path, and nothing is closed
    # if open() itself fails (previously the cleanup referenced an unbound name).
    # NOTE: pickle can execute arbitrary code while loading; only use trusted model files.
    with open(classifier_path, 'rb') as file:
        sentiment_model = pickle.load(file)

except Exception as e:
    print(f"Error while opening file: {e}")

In [None]:
# Set scaler_path to location of the pre-fit scaler
scaler_path = '../../Resources/Models/sentimentScaler.pkl'

# Default to None so that later cells can test whether loading succeeded
# instead of failing on an undefined name.
scaler = None

try:
    # Load and store model in scaler.
    # The context manager closes the file on every path, and nothing is closed
    # if open() itself fails (previously the cleanup referenced an unbound name).
    # NOTE: pickle can execute arbitrary code while loading; only use trusted model files.
    with open(scaler_path, 'rb') as file:
        scaler = pickle.load(file)

except Exception as e:
    print(f"Error while opening file: {e}")

## Predict Labels with Multiprocessing

Using the SBERT model and the array of parsed sentences, we will now produce a corresponding array in which each element is a tuple of the predicted label and an array of the raw probabilities for each label. The emotions in the output should match those printed in the "Extract Emotion Encodings" section.

This process uses multiprocessing to handle large corpora efficiently.

In [None]:
# Set sbert_path to location of SBERT model
sbert_path = '../../Resources/Models/nli-mpnet-base-v2'
# Build the sentence encoder from the path above; it is used by encode_sentence below.
# NOTE(review): ST is presumably an alias for sentence_transformers.SentenceTransformer — confirm against the notebook's imports.
transformer = ST(sbert_path)

In [None]:
# Worker function used for encoding through multiprocessing
def encode_sentence(sent):
    """Return the embedding that the module-level ``transformer`` produces for ``sent``.

    Intended to be mapped over ``parsed_sents`` by a process pool, which calls it
    once per element. The progress bar is disabled so the workers do not each
    emit their own progress output.
    """
    return transformer.encode(sent, show_progress_bar=False)

In [None]:
# When using multiple processes, important to eventually close them to avoid memory/resource leaks.
# p_encode starts as None so the cleanup below is safe even if the pool was never created.
p_encode = None

try:
    start = perf_counter()

    # Define a Pool of worker processes to encode multiple sentences simultaneously.
    # Leaves one core free by default, but always keeps at least one worker so a
    # single-core instance does not ask for an empty pool (or divide by zero below).
    cores_used = max(1, num_cores - 1)
    p_encode = Pool(processes=cores_used)

    # Apply function with Pool to array.
    # chunksize must be a positive integer; plain integer division would give 0
    # whenever there are fewer sentences than workers.
    chunksize = max(1, len(parsed_sents) // cores_used)
    sentence_embeddings = p_encode.map(encode_sentence, parsed_sents, chunksize)

    end = perf_counter()

    elapsed = end - start
    total_minutes = elapsed / 60
    total_seconds = elapsed % 60

    print(f"Took {int(total_minutes)}min {total_seconds:.2f}s to encode {len(parsed_sents)} sentences.")

except Exception as e:
    print(f"Error occurred while encoding sentences: {e}")

finally:
    # Only clean up a pool that was actually created; close() stops new work
    # from being submitted and join() waits for the worker processes to exit.
    if p_encode is not None:
        p_encode.close()
        p_encode.join()

In [None]:
# Both scaler and sentiment_model should exist before running this cell
if scaler is None or sentiment_model is None:
    print("Please load scaler and sentiment model.")
else:
    # Standardize the embeddings with the pre-fit scaler before classification
    standardized = scaler.transform(sentence_embeddings)

    # Predicted class per sentence, first as the encoded value and then as the emotion name
    y_pred_numeric = sentiment_model.predict(standardized)
    y_pred_string = label_encoder.inverse_transform(y_pred_numeric)

    # Raw per-class probabilities for every sentence
    raw_predictions = sentiment_model.predict_proba(standardized)

    # Pair each predicted emotion name with that sentence's probability array
    results = list(zip(y_pred_string, raw_predictions))

    # Print the first five elements of the array as a sanity check
    print(results[:5])

## Create Dataframe

Create a dataframe showing, for each sentence, the predicted label and the estimated probability of each emotion.

In [None]:
# Column order: document metadata, the sentence and its predicted label, then one column per emotion
columns = ['GOID', 'Date', 'Pub ID', 'Sentence', 'Label', *emotions]

# Start every column as an empty list
data = {col: [] for col in columns}

# Append one row per prediction; the k-th result is matched with the k-th parsed sentence
for index, (label, probabilities) in enumerate(results):
    data['GOID'].append(parsed_goids[index])
    data['Date'].append(parsed_dates[index])
    data['Pub ID'].append(parsed_pubids[index])
    data['Sentence'].append(parsed_sents[index])
    data['Label'].append(label)

    # The i-th probability is stored under the i-th emotion name
    for i, emotion in enumerate(emotions):
        data[emotion].append(probabilities[i])

results_df = pd.DataFrame(data=data, columns=columns)

In [None]:
# View final dataframe (one row per sentence, with its predicted label and a column per emotion)
results_df

In [None]:
# Create document-level dataframe: one row per GOID holding the first Date seen
# for that document and the mean of every emotion column across its sentences.
# The emotion columns come from `emotions` (the same list used to build
# results_df) rather than a second hard-coded list, so the aggregation cannot
# silently drift from the label set.
aggregations = {'Date': 'first'}
aggregations.update({str(emotion): 'mean' for emotion in emotions})

means_df = results_df.groupby(['GOID'], as_index=True).agg(aggregations)

In [None]:
# View document-level dataframe (one row per GOID, with the mean of each emotion column)
means_df

In [None]:
# Save the document-level (per-GOID) means to CSV.
# means_output_file is not set in this part of the notebook; it must be defined before running this cell.
means_df.to_csv(means_output_file)

In [None]:
# Save the sentence-level results to CSV.
# results_output_file is not set in this part of the notebook; it must be defined before running this cell.
results_df.to_csv(results_output_file)