In [37]:
import pandas as pd
import numpy as np
import math
data = pd.read_csv("../recommend/movies.csv")
# Select relevant columns
data = data[["original_title", "genres", "director", "cast"]].loc[:10,:]
data["label"] = np.random.randint(0, 2, len(data)) 

def string_to_array(x):
    x = x.strip("[]").replace("'","").replace(' ','')
    x = x.split(",")
    return x

data['genres'] = data['genres'].apply(string_to_array)
data['director'] = data['director'].apply(string_to_array)
data['cast'] = data['cast'].apply(string_to_array)


In [38]:


for index, row in data.iterrows():
    genres, directors, casts = row['genres'], row['director'], row['cast']
    data[genres] = np.zeros((data.shape[0], len(genres)))
    data[directors] = np.zeros((data.shape[0], len(directors)))
    data[casts] = np.zeros((data.shape[0], len(casts)))
    for genre, director, cast in zip(genres, directors, casts):
        data.loc[index, genre] = 1
        data.loc[index, director] = 1
        data.loc[index, cast] = 1

columns_to_exclude = ["original_title", "genres", "director", "label", "cast"]
X_train = data.loc[:, ~data.columns.isin(columns_to_exclude)]
y_train = data.loc[:, "label"]

X_train.to_csv("x_train.csv", index=False)
print(X_train.head)
print(y_train.head)

<bound method NDFrame.head of     Action  Adventure  Fantasy  ScienceFiction  JamesCameron  SamWorthington  \
0      0.0        0.0      0.0             0.0           1.0             1.0   
1      0.0        0.0      0.0             0.0           0.0             0.0   
2      0.0        0.0      0.0             0.0           0.0             0.0   
3      0.0        0.0      0.0             0.0           0.0             0.0   
4      0.0        0.0      0.0             0.0           0.0             0.0   
5      0.0        0.0      0.0             0.0           0.0             0.0   
6      0.0        0.0      0.0             0.0           0.0             0.0   
7      0.0        0.0      0.0             0.0           0.0             0.0   
8      0.0        0.0      0.0             0.0           0.0             0.0   
9      0.0        0.0      0.0             0.0           0.0             0.0   
10     0.0        1.0      0.0             0.0           0.0             0.0   

    Sigou

In [39]:
class Node():
    def __init__(self, feature=None, left=None, right=None, gain=None, value=None):
        self.feature = feature
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value
    
class DecisionTree():
    def __init__(self, min_samples=2, max_depth=2):
        self.min_samples = min_samples
        self.max_depth = max_depth

    def compute_entropy(self, y):
        entropy = 0
        if len(y) != 0:
            p1 = len(y[y==1])/ len(y)
            if p1!=0 and p1!=1:
                entropy = -p1*np.log2(p1) - (1-p1)*np.log2(1-p1)

        return entropy
    
    def compute_leaf_value(self, y):
        label = y.value_counts().idxmax()
        return label

    # split the node
    def split_dataset(self, X, node_indices, feature):
        left_indices = []
        right_indices = []

        for i in node_indices:
            if X.loc[i][feature] == 1.0:
                left_indices.append(i)
            else:
                right_indices.append(i)
        
        return left_indices, right_indices

    # Calculate information gain
    def compute_information_gain(self, X, y, node_indices, feature):
        left_indices, right_indices = self.split_dataset(X, node_indices, feature)
        X_node, y_node = X.loc[node_indices], y.loc[node_indices]
        X_left, y_left = X.loc[left_indices], y.loc[left_indices]
        X_right, y_right = X.loc[right_indices], y.loc[right_indices]

        entropy_node = self.compute_entropy(y_node)
        entropy_left = self.compute_entropy(y_left)
        entropy_right = self.compute_entropy(y_right)    

        w_left = len(left_indices)/ len(node_indices) 
        w_right = len(right_indices)/ len(node_indices)

        information_gain = entropy_node - w_left*entropy_left - w_right*entropy_right

        return information_gain

    # get best split
    def get_best_split(self, X, y, node_indices):
        max_info_gain = -1
        best_feature = ""
        features = X.columns
        for feature in features:
            ig = self.compute_information_gain(X, y, node_indices, feature)
            if ig > max_info_gain:
                max_info_gain = ig
                best_feature = feature
        
        return best_feature
    
    def build_tree_recursive(self, X, y, node_indices, branch_name, max_depth, current_depth):
        X_node, y_node = X.loc[node_indices], y.loc[node_indices]
        feature = self.get_best_split(X, y, node_indices)
        ig = self.compute_information_gain(X, y, node_indices, feature)
        left_indices, right_indices = self.split_dataset(X, node_indices, feature)
        if current_depth <= max_depth:
            if ig > 0:    
                left = self.build_tree_recursive(X, y, left_indices, branch_name="left", max_depth=max_depth,
                                                current_depth=current_depth+1)
                right = self.build_tree_recursive(X, y, right_indices, branch_name="right", max_depth=max_depth,
                                                current_depth=current_depth+1)
                return Node(feature=feature, left=left, right=right)
        
        value = self.compute_leaf_value(y_node)
        return Node(feature=feature, value=value)
        
    def predict(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] == 1.0:
            return self.predict(x, node.left)  # Return the result of recursive call
        else:
            return self.predict(x, node.right)  # Return the result of recursive call


decision_tree = DecisionTree()
node_indices = [i for i in range(X_train.shape[0])]
root = decision_tree.build_tree_recursive(X_train, y_train, node_indices, branch_name="root", max_depth=2, current_depth=0)
print("root", root)
prediction = decision_tree.predict(X_train.loc[0], root)
print("prediction:", prediction)

import joblib
joblib.dump(root, "./root.joblib", compress=True)

root <__main__.Node object at 0x00000291800D46B0>
prediction: 0


['./y_train.joblib']

In [41]:
import joblib
import pandas as pd

class Node():
    def __init__(self, feature=None, left=None, right=None, gain=None, value=None):
        self.feature = feature
        self.left = left
        self.right = right
        self.gain = gain
        self.value = value
    
class DecisionTree():
    def __init__(self):
        self.root = joblib.load("root.joblib")

    def predict(self, x, node):
            if node.value is not None:
                return node.value
            if x[node.feature] == 1.0:
                return self.predict(x, node.left)  # Return the result of recursive call
            else:
                return self.predict(x, node.right)  # Return the result of recursive call
            
    def get_prediction(self, x):
        return self.predict(x, self.root)

if __name__ == "__main__":
    data = joblib.load("X_train.joblib")
    x = data.loc[0]
    decision_tree = DecisionTree()
    prediction = decision_tree.get_prediction(x)
    print("prediction:", prediction)

prediction: 0
