In [7]:
#NB classifier using tf-idf as data processing method

import pandas as pd
import numpy
import warnings 
from sklearn.naive_bayes import GaussianNB
#from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from pandas.core.frame import DataFrame  #need?

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import csv

def tf_idf(data_train, data_valid, data_test):
    """Convert three text collections into dense TF-IDF matrices.

    A single ``TfidfVectorizer`` is fitted on ``data_train`` only; the
    validation and test texts are transformed with that fitted vectorizer.

    Parameters
    ----------
    data_train, data_valid, data_test
        Text collections accepted by ``TfidfVectorizer`` (the callers in this
        notebook pass one DataFrame column each).

    Returns
    -------
    tuple
        ``(train_weight, valid_weight, test_weight, word)`` where the three
        weights are the dense ``.toarray()`` results and ``word`` is a list
        of the vectorizer's feature names.
    """
    vectorizer = TfidfVectorizer()
    train = vectorizer.fit_transform(data_train)
    valid = vectorizer.transform(data_valid)
    test = vectorizer.transform(data_test)

    # Read the names from the vectorizer that produced the matrices instead
    # of fitting a second CountVectorizer just for its vocabulary.
    # get_feature_names() was removed in scikit-learn 1.2, so prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = vectorizer.get_feature_names_out().tolist()
    else:
        word = vectorizer.get_feature_names()

    return train.toarray(), valid.toarray(), test.toarray(), word
    

def convert_to_usable_dataframe(weight, data_frame, word):
    """Return the model-ready feature frame for one data split.

    The result holds every column of ``data_frame`` except the metadata
    columns ``tag``, ``movieId``, ``YTId``, ``year`` and ``title``, followed
    by one column per entry of ``word`` filled from ``weight``.

    Unlike the earlier implementation, ``data_frame`` itself is left
    untouched, and a word that happens to be spelled like a metadata column
    (for example ``year``) keeps its weight column instead of being dropped
    together with the metadata.

    Parameters
    ----------
    weight
        2-D array with one row per row of ``data_frame`` and one column per
        word. An object exposing ``toarray()`` is densified first.
    data_frame : pandas.DataFrame
        Frame containing the five metadata columns listed above.
    word
        Sequence of column names; ``word[j]`` names column ``j`` of
        ``weight``.

    Returns
    -------
    pandas.DataFrame
        New frame that keeps the index of ``data_frame``.

    Raises
    ------
    ValueError
        If ``weight`` and ``data_frame`` have different row counts.
    KeyError
        If one of the metadata columns is missing from ``data_frame``.
    """
    if hasattr(weight, "toarray"):
        weight = weight.toarray()
    instance_count, word_count = weight.shape
    if instance_count != len(data_frame):
        raise ValueError(
            "weight has %d rows but data_frame has %d"
            % (instance_count, len(data_frame))
        )

    # Build all word columns in one go; inserting them one at a time is slow
    # and fragments the frame.
    word_names = list(word)[:word_count]
    word_frame = pd.DataFrame(weight, columns=word_names)

    kept = data_frame.drop(["tag", "movieId", "YTId", "year", "title"], axis = 1)
    # A word that shares its name with a surviving column replaces it, which
    # is what plain column assignment used to do.
    word_name_set = set(word_names)
    clashing = [name for name in kept.columns if name in word_name_set]
    kept = kept.drop(clashing, axis = 1).reset_index(drop=True)

    # Rows are matched by position, then the caller's index is restored.
    combined = pd.concat([kept, word_frame], axis=1)
    combined.index = data_frame.index
    return combined
            
# NOTE: this hides every warning for the rest of the kernel session,
# including pandas and scikit-learn deprecation notices.
warnings.filterwarnings('ignore')

#load data
# Paths are handed straight to pandas so it opens and closes each file
# itself; wrapping them in open() left the handles unclosed.
train_features = pd.read_csv("train_features.tsv", sep='\t')
train_labels = pd.read_csv("train_labels.tsv", sep='\t')
valid_features = pd.read_csv("valid_features.tsv", sep='\t')
valid_labels = pd.read_csv("valid_labels.tsv", sep='\t')
test_features = pd.read_csv("NEW_test_features.tsv", sep='\t')

#do tfidf, convert it to usable data
# Column 4 of each feature file is the text column that gets vectorized.
tf_idf_weight_train, tf_idf_weight_valid, tf_idf_weight_test, allWords = tf_idf(train_features.iloc[:, 4], valid_features.iloc[:, 4], test_features.iloc[:, 4])

#convert the dataframe to usable one
new_train_features = convert_to_usable_dataframe(tf_idf_weight_train, train_features, allWords)
new_valid_features = convert_to_usable_dataframe(tf_idf_weight_valid, valid_features, allWords)
new_test_features = convert_to_usable_dataframe(tf_idf_weight_test, test_features, allWords)

# Only the label column is kept; movieId is an identifier, not a target.
new_train_labels = train_labels.drop(["movieId"], axis = 1)
new_valid_labels = valid_labels.drop(["movieId"], axis = 1)

gnb = GaussianNB()
gnb.fit(new_train_features, new_train_labels.values.ravel()) #convert to array to fit the dataframe
labels_predict = gnb.predict(new_valid_features)

labels_predict_test = gnb.predict(new_test_features)  #for kaggle


def output_csv(test_features, predict_test, output_path=r'/Users/liuyuting/2020 SM1 assignment/ML assignment/assignment 2/predict_test_labels_NB.csv'):
    """Write the test-set predictions as a two-column CSV file.

    The first column of the file is the first column of ``test_features``
    (under its own name); the second column is named ``genres`` and holds
    ``predict_test``. ``test_features`` is not modified.

    Parameters
    ----------
    test_features : pandas.DataFrame
        Frame whose first column identifies each row.
    predict_test
        Sequence of predictions, one per row of ``test_features``.
    output_path
        Destination file. Defaults to the location this notebook has always
        written to, so existing calls behave as before.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    """
    row_count = test_features.shape[0]
    if len(predict_test) != row_count:
        raise ValueError(
            "got %d predictions for %d rows" % (len(predict_test), row_count)
        )

    # Build the output from fresh objects rather than renaming and
    # overwriting columns of the caller's frame.
    id_column = test_features.iloc[:, 0].reset_index(drop=True)
    genre_column = pd.Series(list(predict_test), name='genres')
    data = pd.concat([id_column, genre_column], axis=1)

    data.to_csv(output_path, index = False, header=True)

# Save the test-set predictions (the Kaggle submission file).
output_csv(test_features=test_features, predict_test=labels_predict_test)

# Per-class precision / recall / F1 on the validation split.
report = classification_report(
    y_true=new_valid_labels,
    y_pred=labels_predict,
)
print(report)



              precision    recall  f1-score   support

      Action       0.02      0.17      0.04         6
   Adventure       0.00      0.00      0.00         2
   Animation       0.03      0.67      0.06         3
    Children       0.06      0.67      0.11         3
      Comedy       0.50      0.03      0.05        38
       Crime       0.00      0.00      0.00         5
 Documentary       0.29      0.56      0.38        18
       Drama       0.44      0.09      0.15        43
     Fantasy       0.29      0.11      0.16        18
   Film_Noir       0.13      0.50      0.21         4
      Horror       0.15      0.25      0.19         8
     Musical       0.09      0.10      0.10        10
     Mystery       0.33      0.06      0.10        18
     Romance       0.50      0.02      0.04        51
      Sci_Fi       0.70      0.44      0.54        16
    Thriller       0.83      0.18      0.29        28
         War       0.43      0.14      0.21        21
     Western       0.00    

In [8]:
#NB classifier using one-hot as data processing method

import pandas as pd
import numpy
import warnings 
from sklearn.naive_bayes import GaussianNB
#from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from pandas.core.frame import DataFrame  #need?

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import csv

def all_different_tags(data_frame):
    """Collect every distinct tag in the fifth column of ``data_frame``.

    Each cell of that column is split on ``","``. Tags are returned in
    order of first appearance. Cells that are not strings (for example a
    missing value) contribute no tags.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose column at position 4 holds comma-separated tags.

    Returns
    -------
    list of str
        The distinct tags, first occurrence first.
    """
    tags_column = data_frame.iloc[:, 4]   #take out the tag column
    seen = set()
    ordered_tags = []
    for instance in tags_column:
        if not isinstance(instance, str):
            continue
        for tag in instance.split(","):
            # Set membership keeps this linear; the list only records order.
            if tag not in seen:
                seen.add(tag)
                ordered_tags.append(tag)
    return ordered_tags
    
def one_hot_tag(data_frame, all_different_tags):
    """Replace the ``tag`` column with one 0/1 indicator column per tag.

    The tags of each row are read from the column at position 4, split on
    ``","``. For every entry of ``all_different_tags`` the result has a
    column that is 1 where the row carries that tag and 0 otherwise.

    Unlike the earlier implementation, ``data_frame`` is left untouched,
    rows are addressed by position (so any index works), and a tag that is
    literally named ``tag`` keeps its indicator column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a ``tag`` column; the comma-separated tags are read from
        the column at position 4.
    all_different_tags
        Sequence of tags to encode.

    Returns
    -------
    pandas.DataFrame
        New frame: the columns of ``data_frame`` without ``tag``, followed
        by the indicator columns. It keeps the index of ``data_frame``.
    """
    tags_column = data_frame.iloc[:, 4]
    # Split each row once instead of once per tag; non-string cells
    # (missing values) are treated as having no tags.
    row_tags = [
        set(value.split(",")) if isinstance(value, str) else set()
        for value in tags_column
    ]

    indicators = pd.DataFrame(
        {
            tag: [1 if tag in tags else 0 for tags in row_tags]
            for tag in all_different_tags
        },
        index=range(len(row_tags)),
    )

    kept = data_frame.drop(["tag"], axis = 1)
    # A tag that shares its name with a remaining column replaces it, which
    # is what plain column assignment used to do.
    clashing = [name for name in kept.columns if name in indicators.columns]
    kept = kept.drop(clashing, axis = 1).reset_index(drop=True)

    # Rows are matched by position, then the caller's index is restored.
    encoded = pd.concat([kept, indicators], axis=1)
    encoded.index = data_frame.index
    return encoded
            
# NOTE: this hides every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

#load data
# Paths are handed straight to pandas so it opens and closes each file
# itself; wrapping them in open() left the handles unclosed.
train_features = pd.read_csv("train_features.tsv", sep='\t')
train_labels = pd.read_csv("train_labels.tsv", sep='\t')
valid_features = pd.read_csv("valid_features.tsv", sep='\t')
valid_labels = pd.read_csv("valid_labels.tsv", sep='\t')
test_features = pd.read_csv("NEW_test_features.tsv", sep='\t')

# The tag vocabulary comes from the training split only, so the validation
# split is encoded with exactly the same columns.
all_tags = all_different_tags(train_features)

#convert the dataframe to usable one
new_train_features = one_hot_tag(train_features, all_tags)
new_train_features = new_train_features.drop(["movieId", "YTId", "year", "title"], axis = 1)

new_valid_features = one_hot_tag(valid_features, all_tags)
new_valid_features = new_valid_features.drop(["movieId", "YTId", "year", "title"], axis = 1)

new_train_labels = train_labels.drop(["movieId"], axis = 1)
new_valid_labels = valid_labels.drop(["movieId"], axis = 1)

gnb = GaussianNB()
gnb.fit(new_train_features, new_train_labels.values.ravel()) #convert to array to fit the dataframe
labels_predict = gnb.predict(new_valid_features)

print(classification_report(new_valid_labels, labels_predict))

              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         6
   Adventure       0.00      0.00      0.00         2
   Animation       0.03      0.67      0.06         3
    Children       0.07      1.00      0.13         3
      Comedy       0.00      0.00      0.00        38
       Crime       0.00      0.00      0.00         5
 Documentary       0.22      0.56      0.32        18
       Drama       0.67      0.05      0.09        43
     Fantasy       0.44      0.22      0.30        18
   Film_Noir       0.06      0.25      0.10         4
      Horror       0.11      0.12      0.12         8
     Musical       0.14      0.10      0.12        10
     Mystery       1.00      0.06      0.11        18
     Romance       0.50      0.02      0.04        51
      Sci_Fi       0.56      0.31      0.40        16
    Thriller       1.00      0.04      0.07        28
         War       0.20      0.05      0.08        21
     Western       0.00    

In [9]:
#load data
# Reloaded because the previous cell rebound the feature variables to the
# one-hot encoding. Paths are handed straight to pandas so it opens and
# closes each file itself; wrapping them in open() left handles unclosed.
train_features = pd.read_csv("train_features.tsv", sep='\t')
train_labels = pd.read_csv("train_labels.tsv", sep='\t')
valid_features = pd.read_csv("valid_features.tsv", sep='\t')
valid_labels = pd.read_csv("valid_labels.tsv", sep='\t')
test_features = pd.read_csv("NEW_test_features.tsv", sep='\t')

#do tfidf, convert it to usable data
tf_idf_weight_train, tf_idf_weight_valid, tf_idf_weight_test, allWords = tf_idf(train_features.iloc[:, 4], valid_features.iloc[:, 4], test_features.iloc[:, 4])

#convert the dataframe to usable one
new_train_features = convert_to_usable_dataframe(tf_idf_weight_train, train_features, allWords)
new_valid_features = convert_to_usable_dataframe(tf_idf_weight_valid, valid_features, allWords)
new_test_features = convert_to_usable_dataframe(tf_idf_weight_test, test_features, allWords)

new_train_labels = train_labels.drop(["movieId"], axis = 1)
new_valid_labels = valid_labels.drop(["movieId"], axis = 1)

# Genre names; a label's 1-based position in this list is its numeric code.
class_labels = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western']


def convert_to_num(data, class_labels):
    """Replace class names in the first column of ``data`` with numbers.

    A value equal to ``class_labels[i]`` becomes ``i + 1``. Values that are
    not in ``class_labels`` are kept as they are. Every entry of
    ``class_labels`` is considered (the list is no longer assumed to hold
    exactly 18 names), and ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column holds class names.
    class_labels
        Sequence of class names; if a name is repeated, its first position
        is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the first column converted.
    """
    label_to_number = {}
    for number, label in enumerate(class_labels, start=1):
        label_to_number.setdefault(label, number)

    converted = data.copy()
    first_name = converted.columns[0]
    converted[first_name] = converted.iloc[:, 0].map(
        lambda value: label_to_number.get(value, value)
    )
    return converted

def convert_to_num2(predict, data, class_labels):
    """Return ``data`` with predicted class names written in as numbers.

    For each position ``j``, if ``predict[j]`` equals ``class_labels[i]``
    the result holds ``i + 1`` at position ``j``. Positions whose prediction
    is not in ``class_labels``, and positions beyond ``len(predict)``, keep
    the value ``data`` had there. Every entry of ``class_labels`` is
    considered (the list is no longer assumed to hold exactly 18 names), and
    ``data`` itself is not modified.

    Parameters
    ----------
    predict
        Sequence of predicted class names.
    data : pandas.Series
        Series supplying the length, index and name of the result.
    class_labels
        Sequence of class names; if a name is repeated, its first position
        is used.

    Returns
    -------
    pandas.Series
        New Series with the index and name of ``data``.

    Raises
    ------
    IndexError
        If ``predict`` is longer than ``data``.
    """
    label_to_number = {}
    for number, label in enumerate(class_labels, start=1):
        label_to_number.setdefault(label, number)

    # Work on a plain list so the caller's Series (a slice of the feature
    # frame in this notebook) is never written to.
    values = data.tolist()
    for position, label in enumerate(predict):
        if label in label_to_number:
            values[position] = label_to_number[label]

    return pd.Series(values, index=data.index, name=data.name)

# Validation labels as numeric codes.
data1 = convert_to_num(new_valid_labels, class_labels)


# Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(new_train_features, new_train_labels.values.ravel()) #convert to array to fit the dataframe
labels_predict = gnb.predict(new_valid_features)


# A Series of the right length used as a container for the numeric
# predictions. It is copied so that filling it cannot write through to the
# first feature column of new_valid_features.
temp_data = new_valid_features.iloc[:, 0].copy()

data2 = convert_to_num2(labels_predict, temp_data, class_labels) #predict


data1.to_csv (r'/Users/liuyuting/2020 SM1 assignment/ML assignment/assignment 2/numerical_valid_labels_NB.csv', index = False, header=True)
data2.to_csv (r'/Users/liuyuting/2020 SM1 assignment/ML assignment/assignment 2/numerical_predict_valid_labels_NB.csv', index = False, header=True)


