In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import sklearn
import string
import warnings
import re
from scipy import sparse
from IPython.display import display, Latex, Markdown
warnings.filterwarnings('ignore')
import data_cleaning as dc

# Fetch the NLTK data packages used by the text-processing code in this notebook.
# nltk.download() reports the package as "already up-to-date" (see the cell
# output) when it is present locally, so re-running this cell is cheap.

# Stop-word lists (create_features() expects NLTK stop words as input).
nltk.download('stopwords')
# WordNet data for nltk.stem.wordnet.WordNetLemmatizer, used in process().
nltk.download('wordnet')
# Tagger model for nltk.pos_tag(), used in process().
nltk.download('averaged_perceptron_tagger')
# Open Multilingual WordNet -- presumably required alongside 'wordnet' by the
# installed NLTK version; TODO confirm it is still needed.
nltk.download('omw-1.4')
# Punkt tokenizer data for nltk.word_tokenize(), used in process().
nltk.download('punkt')
# Presumably the newer packaging of the same Punkt/tagger resources expected by
# recent NLTK releases -- kept next to the legacy names above.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Tag-set descriptions; not referenced by the code visible in this notebook.
# This is the last expression of the cell, so its return value is displayed.
nltk.download('tagsets_json')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chasty2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/chasty2/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/chasty2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/chasty2/nltk_data...
[nltk_data]   Package averaged_perceptron_ta

True

In [3]:
### Text Processing

# First letter of the tag returned by nltk.pos_tag -> POS code for the lemmatizer.
_POS_MAPPING = {
    "N": 'n',
    "V": 'v',
    "J": 'a',
    "R": 'r',
}

# Regex to catch URLs (compiled once instead of on every call).
# NOTE(review): the domain part also matches any dot-joined tokens that have no
# whitespace between them (e.g. "3.5" or "end.Next"), so those are removed too.
_URL_REGEX = re.compile(r'''(
    (?:https?://)?        ## Optionally match http:// or https://
    (?:www\.)?            ## Optionally match www.
    [\w.-]+\.\w+          ## Match multiple domains (example.com or sub.domain.co.uk)
    (?:[/?#][^\s]*)?      ## Optionally match paths, queries, or fragments
)''', re.VERBOSE)

# A trailing 's (she's -> she). The word boundary keeps an opening quote that is
# followed by a word starting with "s" intact ('stop' must not become top), and
# IGNORECASE covers upper-case text, which is lowercased later anyway.
_POSSESSIVE_REGEX = re.compile(r"'s\b", re.IGNORECASE)

# Translation table replacing every ASCII punctuation character with a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ Normalizes case, handles punctuation and lemmatizes the tokens
    Steps, in order: remove URLs, drop trailing 's, drop remaining apostrophes,
    replace all other punctuation with spaces, lowercase, tokenize, POS-tag and
    lemmatize each token.
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        list(str): lemmatized tokens of the cleaned text, in order
    """
    ### Process string
    # Remove URLs
    text = _URL_REGEX.sub("", text)
    # Remove all ('s) e.g. she's -> she
    text = _POSSESSIVE_REGEX.sub("", text)
    # Omit other apostrophes e.g. don't -> dont
    text = text.replace("'", "")
    # Swap all other punctuation with ' ', then set to lowercase
    text = text.translate(_PUNCT_TO_SPACE).lower()

    ### Process tokens
    # Tokenize the string and tag every token with its part of speech
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    # Lemmatize tokens. Tags whose first letter is not in _POS_MAPPING are
    # treated as nouns ('n'). The lookup is done with .get() so that a KeyError
    # raised inside the lemmatizer itself is no longer silently retried as a noun.
    return [
        lemmatizer.lemmatize(word, pos=_POS_MAPPING.get(tag[:1], 'n'))
        for word, tag in tagged_tokens
    ]

def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ process all text in the dataframe using process() function.
    Inputs
        df: pd.DataFrame: dataframe containing a column 'text' loaded from the CSV file
        lemmatizer: an instance of a class implementing the lemmatize() method
                    (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs
        pd.DataFrame: dataframe in which the values of text column have been changed from str to list(str),
                        the output from process() function. Other columns are unaffected.
                        NOTE: this is the same object that was passed in; its 'text'
                        column is overwritten in place, so calling this twice on the
                        same dataframe will fail on the already-tokenized column.
    """
    # Forward the caller's lemmatizer to process(); it used to be accepted here
    # but ignored, so a custom lemmatizer never took effect.
    df['text'] = df['text'].apply(lambda text: process(text, lemmatizer))
    return df

### Feature Construction
def create_features(processed_tweets, stop_words):
    """ creates the feature matrix using the processed tweet text
    Inputs:
        processed_tweets: pd.DataFrame: processed tweets read from train/test csv file, containing the column 'text'
                          (each value is a list of tokens, as produced by process())
        stop_words: collection of str (or None): stop_words by nltk stopwords (after processing)
    Outputs:
        sklearn.feature_extraction.text.TfidfVectorizer: the TfidfVectorizer object used
            we need this to transform test tweets in the same way as train tweets
        scipy.sparse matrix: sparse bag-of-words TF-IDF feature matrix
    """
    # Import the submodule explicitly: a bare `import sklearn` is not guaranteed
    # to expose `sklearn.feature_extraction` as an attribute (older releases
    # raise AttributeError unless something else has already imported it).
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Recent scikit-learn releases validate that stop_words is a list (or a
    # string / None), so accept any collection such as a set and convert it.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)

    # Convert processed tweets text values to list of strings, with one tweet per string
    tweets_list = processed_tweets["text"].apply(' '.join).tolist()

    # Learn vocabulary and idf, return document-term matrix.
    # lowercase=False because the tokens were already lowercased by process();
    # min_df=2 drops terms that occur in fewer than two tweets.
    tfidf = TfidfVectorizer(min_df=2, lowercase=False, stop_words=stop_words)
    X = tfidf.fit_transform(tweets_list)

    return tfidf, X

def create_labels(processed_tweets):
    """ creates the class labels from screen_name
    Inputs:
        processed_tweets: pd.DataFrame: tweets read from train file, containing the column 'screen_name'
    Outputs:
        pd.Series: one label per row, aligned with the input index: 0 or 1 for
            the known screen names listed below, NaN for any other screen name
    """
    # Screen names belonging to each of the two classes
    class_zero_handles = ('realDonaldTrump', 'mike_pence', 'GOP')
    class_one_handles = ('HillaryClinton', 'timkaine', 'TheDemocrats')

    # Build the screen_name -> label lookup from the two groups
    handle_to_label = dict.fromkeys(class_zero_handles, 0)
    handle_to_label.update(dict.fromkeys(class_one_handles, 1))

    def _label_for(handle):
        # Unknown screen names fall back to NaN
        return handle_to_label.get(handle, np.nan)

    return processed_tweets['screen_name'].map(_label_for)

### Classification
def learn_classifier(X_train, y_train, kernel):
    """ learns a classifier from the input features and labels using the kernel function supplied
    Inputs:
        X_train: scipy.sparse matrix: sparse matrix of features, output of create_features()
        y_train: array-like of class labels, output of create_labels()
        kernel: str: kernel function to be used with classifier. [linear|poly|rbf|sigmoid]
    Outputs:
        sklearn.svm.SVC: classifier learnt from data
    """
    # Import the submodule explicitly: a bare `import sklearn` is not guaranteed
    # to expose `sklearn.svm` as an attribute (older releases raise
    # AttributeError unless something else has already imported it).
    from sklearn.svm import SVC

    # All SVC hyper-parameters other than the kernel are left at their defaults.
    classifier = SVC(kernel=kernel)
    classifier.fit(X_train, y_train)

    return classifier

def evaluate_classifier(classifier, X_validation, y_validation):
    """ evaluates a classifier based on a supplied validation data
    Inputs:
        classifier: sklearn.svm.SVC (or any object with a predict() method): classifier to evaluate
        X_validation: scipy.sparse matrix: sparse matrix of features
        y_validation: array-like of class labels for the validation rows
    Outputs:
        float: accuracy of classifier on the validation data
    """
    # Import the submodule explicitly: a bare `import sklearn` is not guaranteed
    # to expose `sklearn.metrics` as an attribute (older releases raise
    # AttributeError unless something else has already imported it).
    from sklearn.metrics import accuracy_score

    # Predict a class label for each validation row
    predicted_labels = classifier.predict(X_validation)

    # Fraction of predictions that match the true labels
    return accuracy_score(y_validation, predicted_labels)

In [None]:
### Test ML task
