# Q1. Implement Evaluation Functions

In [1]:
from collections import defaultdict
import gzip

## Precision

In [2]:
def get_precision(y_pred, y_true):
    n = len(y_pred)
    TP = 0
    FP = 0
    for i in range(n):
        if y_pred[i] == 1:
            if y_true[i] == 1:
                TP += 1
            else:
                FP +=1
    
    # edge case
    if TP + FP == 0:
        precision = 1
    # general case  
    else:
        precision = TP/(TP + FP)
        
    return precision

## Recall

In [3]:
def get_recall(y_pred, y_true):
    n = len(y_pred)
    TP = 0
    FN = 0
    for i in range(n):
        if y_true[i] == 1:
            if y_pred[i] == 1:
                TP += 1
            else:
                FN +=1
                
    # edge case
    if TP + FN == 0:
        recall = 1
    # general case 
    else:
        recall = TP/(TP + FN)
        
    return recall

## F1-Score

In [4]:
def get_fscore(y_pred, y_true):
    precision = get_precision(y_pred, y_true)
    recall = get_recall(y_pred, y_true)
    
    # edge case
    if (precision == 0) or (recall == 0):
        fscore = 0
    # general case 
    else: 
        fscore = 2*precision*recall/(precision+recall)
        
    return fscore

## Load File

In [5]:
def load_file(data_file):
    words = []
    labels = []   
    with open(data_file, 'rt', encoding="utf8") as f:
        i = 0
        for line in f:
            if i > 0:
                line_split = line[:-1].split("\t")
                words.append(line_split[0].lower())
                labels.append(int(line_split[1]))
            i += 1
    return words, labels

# Q2. Implement Baselines

## (a) Majority Baseline

In [6]:
def all_complex_feature(words):
    # label all words as complex
    n = len(words)
    return n * [1]

In [7]:
def all_complex(data_file):
    words, y_true = load_file(data_file)
    
    y_pred = all_complex_feature(words)
    
    precision = get_precision(y_pred, y_true)
    recall = get_recall(y_pred, y_true)
    fscore = get_fscore(y_pred, y_true)
    
    performance = [precision, recall, fscore]
    return performance

#### The precision, recall, and f-score on training data are the following

In [8]:
all_complex("complex_words_training.txt")

[0.43133333333333335, 1.0, 0.6027014438751747]

#### The precision, recall, and f-score on training data are the following

In [9]:
all_complex("complex_words_development.txt")

[0.418, 1.0, 0.5895627644569816]

## (b) Word Length Baseline

In [10]:
def length_threshold_feature(words, threshold):
    # label words based on length
    result = []
    n = len(words)
    
    for i in range(n):
        # if the lenght is shorter than threshold, then it's a simple word
        if len(words[i]) < threshold:
            result += [0]
        else:
            result += [1]
            
    return result

In [11]:
def word_length_threshold(training_file, development_file):

    train_words, train_true = load_file(training_file)
    dev_words, dev_true = load_file(development_file)

    tran_len = len(train_words)
    max_len = 1
    for i in range(tran_len):
        if len(train_words[i]) > max_len:
            max_len = len(train_words[i])


    best_thre = 1
    best_f = 0
    
    #FInd best threshold that gives highest f1-score
    for thre in range(1, max_len+2):
        cur_train_pred = length_threshold_feature(train_words, thre)
        cur_fscore = get_fscore(cur_train_pred, train_true)
        if cur_fscore > best_f:
            best_f = cur_fscore
            best_thre = thre


    train_pred = length_threshold_feature(train_words, best_thre)
    dev_pred = length_threshold_feature(dev_words, best_thre)

    tprecision = get_precision(train_pred, train_true)
    trecall = get_recall(train_pred, train_true)
    tfscore = get_fscore(train_pred, train_true)

    dprecision = get_precision(dev_pred, dev_true)
    drecall = get_recall(dev_pred, dev_true)
    dfscore = get_fscore(dev_pred, dev_true)

    training_performance = [tprecision, trecall, tfscore]
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

#### The precision, recall, and f-score on training data, and development data are the following

In [12]:
word_length_threshold("complex_words_training.txt", "complex_words_development.txt")

([0.5985877240630092, 0.8516228748068007, 0.7030303030303029],
 [0.6053511705685619, 0.8660287081339713, 0.7125984251968505])

## (c) Word Frequency Baseline

In [13]:
def load_ngram_counts(ngram_counts_file): 
    # load words frequency counts
    counts = defaultdict(int) 
    with gzip.open(ngram_counts_file, 'rt') as f: 
        for line in f:
            token, count = line.strip().split('\t') 
            if token[0].islower(): 
                counts[token] = int(count) 
    return counts

In [14]:
counts = load_ngram_counts("ngram_counts.txt.gz")

In [15]:
def frequency_threshold_feature(words, threshold, counts):
    # label words based on frequency
    n = len(words)
    result = []
    for i in range(n):
        if counts[words[i]] < threshold:
            result += [1]
        else:
            result += [0]
    return result

In [16]:
def word_frequency_threshold(training_file, development_file, counts):

    train_words, train_true = load_file(training_file)
    dev_words, dev_true = load_file(development_file)
    
    train_len = len(train_words)
    
    train_counts = {}
    freq = []
    
    for i in range(train_len):
        train_counts[train_words[i]] = counts[train_words[i]]
        
        #if this word is complex, we put its frequency into to a list 'freq'
        if train_true[i] == 1:
            if counts[train_words[i]] not in freq:
                freq += [counts[train_words[i]]]
    
    # add lower bound into frequency list
    if 0 not in freq:
        freq += [0]
    
    # add upper bound into frequenvy list
    add_max = max(freq) + 1
    freq += [add_max]
    
    # find best threshold that gives highest f1-score
    best_thre = 0
    best_f = 0
    for thre in freq:
        cur_train_pred = frequency_threshold_feature(train_words, thre, counts)
        cur_fscore = get_fscore(cur_train_pred, train_true)
        if cur_fscore > best_f:
            best_f = cur_fscore
            best_thre = thre


    train_pred = frequency_threshold_feature(train_words, best_thre, counts)
    dev_pred = frequency_threshold_feature(dev_words, best_thre, counts)

    tprecision = get_precision(train_pred, train_true)
    trecall = get_recall(train_pred, train_true)
    tfscore = get_fscore(train_pred, train_true)

    dprecision = get_precision(dev_pred, dev_true)
    drecall = get_recall(dev_pred, dev_true)
    dfscore = get_fscore(dev_pred, dev_true)


    training_performance = [tprecision, trecall, tfscore]
    development_performance = [dprecision, drecall, dfscore]
    return training_performance, development_performance

#### The precision, recall, and f-score on training data, and development data are the following

In [17]:
word_frequency_threshold("complex_words_training.txt", "complex_words_development.txt", counts)

([0.5652637187000533, 0.8199381761978362, 0.6691895301166825],
 [0.556782334384858, 0.8444976076555024, 0.6711026615969581])

# Q3. Implement Classifiers

In [18]:
import numpy as np

## (a) Naive Bayes

#### This function carries out feature engineering based on the orginal data sets into matrices as inputs for the model training and testing later

In [19]:
def make_data_matrix(training_file, development_file, counts):
    
    # compute train_X and train_Y matrice based on training data
    t_words, t_y_true = load_file(training_file)
    t_n = len(t_words)
    
    t_words_len_l = []
    t_words_freq_l = []
    
    # add features such as word length and word frequency
    for i in range(t_n):
        t_words_len_l += [len(t_words[i])]
        t_words_freq_l += [counts[t_words[i]]]
    
    t_words_len_l_mean = np.mean(t_words_len_l)
    t_words_freq_l_mean = np.mean(t_words_freq_l)
    t_words_len_l_sd = np.std(t_words_len_l)
    t_words_freq_l_sd = np.std(t_words_freq_l)
    
    # normalization
    for i in range(t_n):
        t_words_len_l[i] = (t_words_len_l[i] - t_words_len_l_mean)/t_words_len_l_sd
        t_words_freq_l[i] = (t_words_freq_l[i] - t_words_freq_l_mean)/t_words_freq_l_sd
    
    t_train = []
    
    for i in range(t_n):
        t_single_row = []
        t_single_row += [t_words_len_l[i]]
        t_single_row += [t_words_freq_l[i]]
        t_train += [t_single_row]

    
    # compute test_X and test_Y matrice based on development data
    d_words, d_y_true = load_file(development_file)
    d_n = len(d_words)
    
    d_words_len_l = []
    d_words_freq_l = []
    
    # add features
    for i in range(d_n):
        d_words_len_l += [len(d_words[i])]
        d_words_freq_l += [counts[d_words[i]]]
    
    # still using the mean and standard deviation from training data to do normalization
    for i in range(d_n):
        d_words_len_l[i] = (d_words_len_l[i] - t_words_len_l_mean)/t_words_len_l_sd
        d_words_freq_l[i] = (d_words_freq_l[i] - t_words_freq_l_mean)/t_words_freq_l_sd
    
    d_train = []
    
    for i in range(d_n):
        d_single_row = []
        d_single_row += [d_words_len_l[i]]
        d_single_row += [d_words_freq_l[i]]
        d_train += [d_single_row]
    
    
    return np.array(t_train), np.array(t_y_true), np.array(d_train), np.array(d_y_true), 

#### Run Naive Bayes Models

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
 def naive_bayes(training_file, development_file, counts):

        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix(training_file, development_file, counts)      
        
        clf = GaussianNB()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output

In [22]:
naive_bayes("complex_words_training.txt","complex_words_development.txt", counts)

((0.4918351477449456, 0.9775888717156105, 0.6544231764097258),
 (0.4700352526439483, 0.9569377990430622, 0.6304176516942475))

## (b) Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
 def logistic_regression(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix(training_file, development_file, counts)      
        
        clf = LogisticRegression()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output

In [25]:
logistic_regression("complex_words_training.txt","complex_words_development.txt", counts)

((0.7206751054852321, 0.6599690880989181, 0.6889874949576441),
 (0.7229219143576826, 0.6866028708133971, 0.7042944785276073))

# Q4. Implement Your Own Model 

## Import count_syllables function to add feature "number of syllables"

In [26]:
import re

## Counts number of syllables (from eayd.in/?p=232)
def count_syllables(word): 
    word = word.lower() 
    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables
    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately'] 
    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']
    pre_one = ['preach']
 
    syls = 0 #added syllable number
    disc = 0 #discarded syllable number
 
    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls
 
    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.) 
    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1
 
    #3) discard trailing "e", except where ending is "le"   
    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while'] 
    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass 
        else :
            disc+=1
 
    #4) check if consecutive vowels exists, triplets or pairs, count them as one. 
    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple
 
    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))
 
    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1
 
    #7) add one if ends with "y" but is not surrouned by vowel
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1
 
    #8) add one if "y" is surrounded by non-vowels and is not in the last word.
    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1
 
    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one. 
    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1 
    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1
 
    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"
    if word[-3:] == "ian" : 
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1
 
    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly. 
    if word[:2] == "co" and word[2] in 'eaoui' :
        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1
 
    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly. 
    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1
 
    #13) check for "-n't" and cross match with dictionary to add syllable.
    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"] 
    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass  
 
    #14) Handling the exceptional words. 
    if word in exception_del :
        disc+=1 
    if word in exception_add :
        syls+=1    
 
    # calculate the output
    return numVowels - disc + syls

## Import Wordnet and build function to add features "number of synoyms" and "number of antonyms"

In [27]:
import nltk 
from nltk.corpus import wordnet 

In [28]:
def count_syn_ant(word):
    synonyms = [] 
    antonyms = [] 
  
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonyms.append(l.name()) 
            if l.antonyms(): 
                antonyms.append(l.antonyms()[0].name()) 
                
    return len(set(synonyms)), len(set(antonyms))

## Add sentence complexituy features, "length of the sentence", "average word length", and "average word frequency"

### Update load file function to load sentences

In [29]:
def load_file_new(data_file):
    words = []
    labels = []   
    sentence = [] 
    with open(data_file, 'rt', encoding="utf8") as f:
        i = 0
        for line in f:
            if i > 0:
                line_split = line[:-1].split("\t")
                words.append(line_split[0].lower())
                labels.append(int(line_split[1]))
                
                sentence_str = line_split[3].split()
                sentence_list = [i for i in sentence_str if i not in ",.!?:;@#$%^&*()``~''--+=|\/<>"]   
                sentence.append(sentence_list)
                
            i += 1
    return words, labels, sentence

### Length of the Sentence

In [30]:
def sentence_words_number(sentence):
    return len(sentence)

### Average Word Length 

In [31]:
def sentence_words_avg_len(sentence):
    n = len(sentence)
    total_char = 0
    for i in range(n):
        total_char += len(sentence[i])
    return total_char/n

### Average Word Frequency

In [32]:
def sentence_avg_word_freq(sentence, counts):
    n = len(sentence)
    total_freq = 0
    for i in range(n):
        total_freq += counts[sentence[i]]
    return total_freq/n

## Now I have features "word lenght", "word frequency", "number of syllables", "number of synoyms", and "number of antonyms". I will carry out the following feature engineering function to make matrices for model training and testing.

In [33]:
def make_data_matrix_mine(training_file, development_file, counts):
    
    # compute train_X and train_Y matrice based on training data
    words, y_true, sentences = load_file_new(training_file)
    
    n = len(words)
    words_len_l = []
    words_freq_l = []
    words_syllable_l = []
    words_synonyms_count_l = []
    words_antonyms_count_l = []
    sent_len_l = []
    sent_words_avg_len_l = []
    sent_words_avg_freq_l = []
    
    # add features 
    for i in range(n):
        words_len_l += [len(words[i])]
        words_freq_l += [counts[words[i]]] 
        words_syllable_l += [count_syllables(words[i])]
        words_synonyms_count_l += [count_syn_ant(words[i])[0]]
        words_antonyms_count_l += [count_syn_ant(words[i])[1]]
        sent_len_l += [sentence_words_number(sentences[i])]
        sent_words_avg_len_l += [sentence_words_avg_len(sentences[i])]
        sent_words_avg_freq_l += [sentence_avg_word_freq(sentences[i], counts)]
    
    # compute means and standard deviation 
    words_len_l_mean = np.mean(words_len_l)
    words_freq_l_mean = np.mean(words_freq_l)
    words_syllable_l_mean = np.mean(words_syllable_l)
    words_synonyms_count_l_mean = np.mean(words_synonyms_count_l)
    words_antonyms_count_l_mean = np.mean(words_antonyms_count_l)   
    sent_len_l_mean = np.mean(sent_len_l)
    sent_words_avg_len_l_mean = np.mean(sent_words_avg_len_l)
    sent_words_avg_freq_l_mean = np.mean(sent_words_avg_freq_l)

    words_len_l_sd = np.std(words_len_l)
    words_freq_l_sd = np.std(words_freq_l)
    words_syllable_l_sd = np.std(words_syllable_l)
    words_synonyms_count_l_sd = np.std(words_synonyms_count_l)
    words_antonyms_count_l_sd = np.std(words_antonyms_count_l)
    sent_len_l_sd = np.std(sent_len_l)
    sent_words_avg_len_l_sd = np.std(sent_words_avg_len_l)
    sent_words_avg_freq_l_sd = np.std(sent_words_avg_freq_l)
                                       
    # normalization
    for i in range(n):
        words_len_l[i] = (words_len_l[i] - words_len_l_mean)/words_len_l_sd
        words_freq_l[i] = (words_freq_l[i] - words_freq_l_mean)/words_freq_l_sd
        words_syllable_l[i] = (words_syllable_l[i] - words_syllable_l_mean)/words_syllable_l_sd
        words_synonyms_count_l[i] = (words_synonyms_count_l[i] - words_synonyms_count_l_mean)/words_synonyms_count_l_sd
        words_antonyms_count_l[i] = (words_antonyms_count_l[i] - words_antonyms_count_l_mean)/words_antonyms_count_l_sd          
        sent_len_l[i] = (sent_len_l[i] - sent_len_l_mean)/sent_len_l_sd
        sent_words_avg_len_l[i] = (sent_words_avg_len_l[i] - sent_words_avg_len_l_mean)/sent_words_avg_len_l_sd
        sent_words_avg_freq_l[i] = (sent_words_avg_freq_l[i] - sent_words_avg_freq_l_mean)/sent_words_avg_freq_l_sd
                                                          
    train = []
    
    for i in range(n):
        single_row = []
        single_row += [words_len_l[i]]
        single_row += [words_freq_l[i]]
        single_row += [words_syllable_l[i]]
        single_row += [words_synonyms_count_l[i]]
        single_row += [words_antonyms_count_l[i]]                 
        single_row += [sent_len_l[i]]
        single_row += [sent_words_avg_len_l[i]]
        single_row += [sent_words_avg_freq_l[i]]
        train += [single_row]
                             
            
    # compute test_X and test_Y matrice based on development data
    d_words, d_y_true, d_sentences = load_file_new(development_file)
    
    d_n = len(d_words)
    d_words_len_l = []
    d_words_freq_l = []
    d_words_syllable_l = []
    d_words_synonyms_count_l = []
    d_words_antonyms_count_l = []
    d_sent_len_l = []
    d_sent_words_avg_len_l = []
    d_sent_words_avg_freq_l = []
    
    # add features
    for i in range(d_n):
        d_words_len_l += [len(d_words[i])]
        d_words_freq_l += [counts[d_words[i]]] 
        d_words_syllable_l += [count_syllables(d_words[i])]
        d_words_synonyms_count_l += [count_syn_ant(d_words[i])[0]]
        d_words_antonyms_count_l += [count_syn_ant(d_words[i])[1]]
        d_sent_len_l += [sentence_words_number(sentences[i])]
        d_sent_words_avg_len_l += [sentence_words_avg_len(sentences[i])]
        d_sent_words_avg_freq_l += [sentence_avg_word_freq(sentences[i], counts)]
    
    # still using the mean and standard deviation from training data to do normalization
    for i in range(d_n):
        d_words_len_l[i] = (d_words_len_l[i] - words_len_l_mean)/words_len_l_sd
        d_words_freq_l[i] = (d_words_freq_l[i] - words_freq_l_mean)/words_freq_l_sd
        d_words_syllable_l[i] = (d_words_syllable_l[i] - words_syllable_l_mean)/words_syllable_l_sd
        d_words_synonyms_count_l[i] = (d_words_synonyms_count_l[i] - words_synonyms_count_l_mean)/words_synonyms_count_l_sd
        d_words_antonyms_count_l[i] = (d_words_antonyms_count_l[i] - words_antonyms_count_l_mean)/words_antonyms_count_l_sd          
        d_sent_len_l[i] = (d_sent_len_l[i] - sent_len_l_mean)/sent_len_l_sd
        d_sent_words_avg_len_l[i] = (d_sent_words_avg_len_l[i] - sent_words_avg_len_l_mean)/sent_words_avg_len_l_sd
        d_sent_words_avg_freq_l[i] = (d_sent_words_avg_freq_l[i] - sent_words_avg_freq_l_mean)/sent_words_avg_freq_l_sd
    
                            
    d_train = []
    
    for i in range(d_n):
        d_single_row = []
        d_single_row += [d_words_len_l[i]]
        d_single_row += [d_words_freq_l[i]]
        d_single_row += [d_words_syllable_l[i]]
        d_single_row += [d_words_synonyms_count_l[i]]
        d_single_row += [d_words_antonyms_count_l[i]]  
        d_single_row += [d_sent_len_l[i]]
        d_single_row += [d_sent_words_avg_len_l[i]]
        d_single_row += [d_sent_words_avg_freq_l[i]]
        d_train += [d_single_row]

                              
    return np.array(train), np.array(y_true), np.array(d_train), np.array(d_y_true)

## Try different classificaiton models and compare scores

### (1) Naive Bayes

In [35]:
 def naive_bayes_new(training_file, development_file, counts):  
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        clf = GaussianNB()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)
        
        return training_performance, development_performance


#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output

In [36]:
naive_bayes_new("complex_words_training.txt","complex_words_development.txt", counts)

((0.5347826086956522, 0.9505409582689336, 0.6844741235392322),
 (0.524, 0.9401913875598086, 0.672945205479452))

### (2)Logistic Regression

In [37]:
 def logistic_regression_new(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        clf = LogisticRegression()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output

In [38]:
logistic_regression_new("complex_words_training.txt","complex_words_development.txt", counts)

((0.7244986922406277, 0.6421947449768161, 0.6808684965178206),
 (0.7313829787234043, 0.6578947368421053, 0.6926952141057935))

### (3) Random Forest 

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
 def random_forest(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        # we have five features
        clf = RandomForestClassifier(max_depth = 8)
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output 

In [41]:
random_forest("complex_words_training.txt","complex_words_development.txt", counts)

((0.7891304347826087, 0.8415765069551777, 0.8145100972326104),
 (0.6962025316455697, 0.7894736842105263, 0.7399103139013453))

### (4)Decision Tree 

In [42]:
from sklearn import tree

In [43]:
 def decision_tree(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        clf = tree.DecisionTreeClassifier()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output 

In [44]:
decision_tree("complex_words_training.txt","complex_words_development.txt", counts)

((1.0, 1.0, 1.0), (0.6333333333333333, 0.6363636363636364, 0.6348448687350836))

#### We can see that decision tree overfits the training data

### (5) KNN

In [45]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
 def knn(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        clf = KNeighborsClassifier()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output# 

In [47]:
knn("complex_words_training.txt","complex_words_development.txt", counts)

((0.7596463022508039, 0.7302936630602782, 0.7446808510638298),
 (0.6339712918660287, 0.6339712918660287, 0.6339712918660287))

### (6) Support Vector Machine 

In [48]:
from sklearn import svm

In [49]:
 def support_vector_machine(training_file, development_file, counts):
        train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine(training_file, development_file, counts)      
        
        clf = svm.SVC()
        clf.fit(train_x, train_y_true)
        
        train_y_pred = clf.predict(train_x)
        train_y_pred_l = train_y_pred.tolist()
        
        tprecision = get_precision(train_y_pred_l, train_y_true)
        trecall = get_recall(train_y_pred_l, train_y_true)
        tfscore = get_fscore(train_y_pred_l, train_y_true)
          
        dev_y_pred = clf.predict(dev_x)
        dev_y_pred_l = dev_y_pred.tolist()
        
        dprecision = get_precision(dev_y_pred_l, dev_y_true)
        drecall = get_recall(dev_y_pred_l, dev_y_true)
        dfscore = get_fscore(dev_y_pred_l, dev_y_true)
        
        
        training_performance = (tprecision, trecall, tfscore)
        development_performance = (dprecision, drecall, dfscore)

        return training_performance, development_performance

#### The precision, recall, and f-score on training data is in the first line of the following output. 
#### The precision, recall, and f-score on development data is in the second line of the following output 

In [50]:
support_vector_machine("complex_words_training.txt","complex_words_development.txt", counts)

((0.7254575707154742, 0.6738794435857806, 0.6987179487179487),
 (0.7186700767263428, 0.6722488038277512, 0.6946847960444994))

## In all, I find that Random Forest Model gives me the best result based on f1-scores. So I will combine the training data and development data as one to train the Random Forest Model, and make it to predict the testing data.

### Load the unlabeled data file 

In [51]:
def load_file_test(data_file):
    words = []
    sentence = []   
    with open(data_file, 'rt', encoding="utf8") as f:
        i = 0
        for line in f:
            if i > 0:
                line_split = line[:-1].split("\t")
                words.append(line_split[0].lower())
                sentence_str = line_split[1].split()
                sentence_list = [i for i in sentence_str if i not in ",.!?:;@#$%^&*()``~''--+=|\/<>"]   
                sentence.append(sentence_list)
            i += 1
    return words, sentence

### Do feature engineering for the predicting dataset 

In [52]:
def make_data_matrix_for_test(training_file, test_file, counts):
    
    # Load training data first to get the means and standard deviations of each feature
    words, y_true, sentences = load_file_new(training_file)
    n = len(words)
    words_len_l = []
    words_freq_l = []
    words_syllable_l = []
    words_synonyms_count_l = []
    words_antonyms_count_l = []
    sent_len_l = []
    sent_words_avg_len_l = []
    sent_words_avg_freq_l = []
    
    for i in range(n):
        words_len_l += [len(words[i])]
        words_freq_l += [counts[words[i]]] 
        words_syllable_l += [count_syllables(words[i])]
        words_synonyms_count_l += [count_syn_ant(words[i])[0]]
        words_antonyms_count_l += [count_syn_ant(words[i])[1]]
        sent_len_l += [sentence_words_number(sentences[i])]
        sent_words_avg_len_l += [sentence_words_avg_len(sentences[i])]
        sent_words_avg_freq_l += [sentence_avg_word_freq(sentences[i], counts)]
    
    # get the means and standard deviations of each feature
    words_len_l_mean = np.mean(words_len_l)
    words_freq_l_mean = np.mean(words_freq_l)
    words_syllable_l_mean = np.mean(words_syllable_l)
    words_synonyms_count_l_mean = np.mean(words_synonyms_count_l)
    words_antonyms_count_l_mean = np.mean(words_antonyms_count_l)       
    sent_len_l_mean = np.mean(sent_len_l)
    sent_words_avg_len_l_mean = np.mean(sent_words_avg_len_l)
    sent_words_avg_freq_l_mean = np.mean(sent_words_avg_freq_l)
    
    words_len_l_sd = np.std(words_len_l)
    words_freq_l_sd = np.std(words_freq_l)
    words_syllable_l_sd = np.std(words_syllable_l)
    words_synonyms_count_l_sd = np.std(words_synonyms_count_l)
    words_antonyms_count_l_sd = np.std(words_antonyms_count_l)
    sent_len_l_sd = np.std(sent_len_l)
    sent_words_avg_len_l_sd = np.std(sent_words_avg_len_l)
    sent_words_avg_freq_l_sd = np.std(sent_words_avg_freq_l)
    
    
    # load unlabeled tesing data            
    t_words,  t_sentences = load_file_test(test_file)
    t_n = len(t_words)
    t_words_len_l = []
    t_words_freq_l = []
    t_words_syllable_l = []
    t_words_synonyms_count_l = []
    t_words_antonyms_count_l = []
    t_sent_len_l = []
    t_sent_words_avg_len_l = []
    t_sent_words_avg_freq_l = []
    
    # add features
    for i in range(t_n):
        t_words_len_l += [len(t_words[i])]
        t_words_freq_l += [counts[t_words[i]]] 
        t_words_syllable_l += [count_syllables(t_words[i])]
        t_words_synonyms_count_l += [count_syn_ant(t_words[i])[0]]
        t_words_antonyms_count_l += [count_syn_ant(t_words[i])[1]]
        t_sent_len_l += [sentence_words_number(t_sentences[i])]
        t_sent_words_avg_len_l += [sentence_words_avg_len(t_sentences[i])]
        t_sent_words_avg_freq_l += [sentence_avg_word_freq(t_sentences[i], counts)]
    
    # normalization
    for i in range(t_n):
        t_words_len_l[i] = (t_words_len_l[i] - words_len_l_mean)/words_len_l_sd
        t_words_freq_l[i] = (t_words_freq_l[i] - words_freq_l_mean)/words_freq_l_sd
        t_words_syllable_l[i] = (t_words_syllable_l[i] - words_syllable_l_mean)/words_syllable_l_sd
        t_words_synonyms_count_l[i] = (t_words_synonyms_count_l[i] - words_synonyms_count_l_mean)/words_synonyms_count_l_sd
        t_words_antonyms_count_l[i] = (t_words_antonyms_count_l[i] - words_antonyms_count_l_mean)/words_antonyms_count_l_sd                    
        t_sent_len_l[i] = (t_sent_len_l[i] - sent_len_l_mean)/sent_len_l_sd
        t_sent_words_avg_len_l[i] = (t_sent_words_avg_len_l[i] - sent_words_avg_len_l_mean)/sent_words_avg_len_l_sd
        t_sent_words_avg_freq_l[i] = (t_sent_words_avg_freq_l[i] - sent_words_avg_freq_l_mean)/sent_words_avg_freq_l_sd
    
                            
    t_train = []
    
    for i in range(t_n):
        t_single_row = []
        t_single_row += [t_words_len_l[i]]
        t_single_row += [t_words_freq_l[i]]
        t_single_row += [t_words_syllable_l[i]]
        t_single_row += [t_words_synonyms_count_l[i]]
        t_single_row += [t_words_antonyms_count_l[i]]                    
        t_single_row += [t_sent_len_l[i]]
        t_single_row += [t_sent_words_avg_len_l[i]]
        t_single_row += [t_sent_words_avg_freq_l[i]]
        t_train += [t_single_row]
    
    return np.array(t_train)


### Combine training and development datasets, train random forest model on it, and make prediction on unlabeled testing data

In [53]:
train_x, train_y_true, dev_x, dev_y_true = make_data_matrix_mine("complex_words_training.txt","complex_words_development.txt", counts)      
x_total = np.concatenate((train_x, dev_x), axis=0)
y_true_total = np.concatenate((train_y_true, dev_y_true), axis=0)

clf = RandomForestClassifier(max_depth=8)
clf.fit(x_total, y_true_total)

y_pred =  clf.predict(x_total)
precision = get_precision(y_pred, y_true_total)
recall = get_recall(y_pred, y_true_total)
fscore = get_fscore(y_pred, y_true_total)

### The precision, recall and f1-score based on model trained and test on combined model are the following output

In [54]:
precision, recall, fscore

(0.8006737787759686, 0.8329439252336449, 0.8164901231033497)

### Make prediction on unlabeld data and save output

In [55]:
test_y = make_data_matrix_for_test("complex_words_training.txt", "complex_words_test_unlabeled.txt", counts)
final_result = clf.predict(test_y)

In [56]:
np.savetxt("Ziran_Min_Predicted Results.txt",final_result,fmt='%i')