# IMPORTS

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Part 1

In [2]:

def clean_agreement(df):
    '''
    Keep only the rows whose positive label count is strictly greater than
    the negative label count.

    Rows where the two counts are equal are dropped as well, because the
    comparison is strict. The input dataframe needs the columns
    'labels_negative' and 'labels_positive'; the original index labels of
    the surviving rows are preserved.
    '''
    positive_majority = df['labels_positive'] > df['labels_negative']
    return df.loc[positive_majority]



def set_agreement_threshold(df, threshold=0.4):
    '''
    This function takes in our cleaned dataframe
    and keeps only the rows whose agreement score reaches a threshold.

    df: dataframe with an 'agreement' column
    threshold: minimum agreement score a row needs in order to be kept
               (inclusive). Defaults to 0.4, the value used in this notebook.

    Returns the filtered rows; the original index labels are preserved.
    '''
    meets_threshold = df['agreement'] >= threshold
    return df.loc[meets_threshold]

def text_preprocessing(df, stop_words=None):
    """Strip noise from the 'text' column and return a cleaned copy of the dataframe.

    The following steps are applied, in this order, to every entry of 'text':
      1. remove @mentions
      2. remove http/https URLs
      3. remove digits and the characters . % + * / ? & #
      4. remove whole words that are one or two characters long
      5. remove hyphens, commas, apostrophes and the characters ; : ( ) ! @ # = $
      6. convert to lower case and collapse whitespace runs into single spaces
      7. drop stop words

    Args:
        df (DataFrame): dataframe containing the 'text' column to clean
        stop_words (iterable of str, optional): lower-case words to drop in
            step 7. When omitted, NLTK's English stop word list is used.
    Returns:
        DataFrame: a copy of df with the cleaned 'text' column; the input
            dataframe itself is left unmodified.
    """
    # Patterns are removed one after another; the order matters because each
    # step operates on the output of the previous one.
    removal_patterns = (
        # usernames / mentions
        '@[a-zA-Z0-9_]+',
        # URLs
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # digits and a first batch of special characters
        '[.%+*/0-9?&#]+',
        # words with 2 or fewer characters
        r'\b\w{1,2}\b',
        # remaining punctuation
        '([-,]+)|((\')+)|([;:()!@#=$]+)',
    )

    # Resolve the stop words once and keep them in a set: looking the list up
    # again for every single token is needlessly slow.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = frozenset(stop_words)

    cleaned = df['text']
    for pattern in removal_patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    def _normalise(entry):
        # Missing values (NaN/None) pass through untouched instead of crashing.
        if not isinstance(entry, str):
            return entry
        # split() with no argument also collapses repeated whitespace.
        return " ".join(word for word in entry.lower().split() if word not in stop_set)

    # Work on a copy so that a filtered slice passed in by the caller is not
    # mutated (which would also raise pandas' SettingWithCopyWarning).
    result = df.copy()
    result['text'] = cleaned.apply(_normalise)
    return result

In [3]:
# Load the training data and run it through the cleaning stages in order:
#   1. clean_agreement         - keep rows with more positive than negative labels
#   2. set_agreement_threshold - keep rows whose agreement score is at least 0.4
#   3. text_preprocessing      - clean up the 'text' column
df = (
    pd.read_csv('train.csv')
    .pipe(clean_agreement)
    .pipe(set_agreement_threshold)
    .pipe(text_preprocessing)
)


In [4]:
# Preview the first rows of the filtered, preprocessed dataframe
df.head()

Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement,id
1,10.18356/5950d914-en,b6415a528064b85fdde4b4c61239ed3a,manufacturing value added percentage gdp stood...,9,0,3,1.0,2
2,10.18356/31959a6d-en,29127def7e81b999b87c8e887a4fe882,"share share question"" volume gender politics f...",5,2,7,0.555556,3
4,10.1787/9789264119536-11-en,8b7d8c6c605fe9695d08ab03d601e0e9,question considerable policy relevance extent ...,10,1,4,0.6,5
7,10.6027/9789289350433-5-en,1cd35b929d9d744b60c2c2a7dc8575fe,capacity building text adaptation rather conce...,13,1,8,0.777778,8
8,10.1787/5jlwvz85537c-en,439857f2a5cb22fc9f73878bf6149316,one method induce sensation travelling high sp...,11,0,3,1.0,9


# Part 2

# Part 3