# Homework 1: Six Degrees of Kevin Bacon

Read [bfs_six_degrees.pdf](bfs_six_degrees.pdf) for the assignment spec. Submit this .ipynb to the professor to demonstrate your solution. This is unofficial and is maintained by a fellow student, Justin, aka the glizzy goblin.

In [None]:
from graph_tools import graph
from collections import deque

In [14]:
# Helper Functions
def process_credits_file_to_graph(graph:graph, file_path:str):
    """Feed every line of a credits file into ``graph``.

    The file at ``file_path`` is read as UTF-8 and each line is handed to
    ``process_line``. A missing file or a decoding failure is reported with
    ``print`` rather than raised; lines already processed before a decoding
    error remain in the graph.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as credits_file:
            for credit_line in credits_file:
                process_line(graph, credit_line)
    except FileNotFoundError:
        print(f"Error: was not able to find {file_path}")
    except UnicodeDecodeError as decode_error:
        print(f"Encoding error while reading the file: {decode_error}")

def process_line(graph:graph, line:str):
    """Add the co-star edges described by one credits line to ``graph``.

    The line is split on whitespace: the first token is used as the movie
    name and the remaining tokens as its actors. Lines with no tokens are
    ignored.
    """
    tokens = line.strip().split()
    if len(tokens) == 0:
        return

    movie_name = tokens[0]
    cast = tokens[1:]

    # Walk every ordered pair of different names, so each pairing is
    # submitted once in each direction.
    for actor_1 in cast:
        for actor_2 in cast:
            if actor_2 == actor_1:
                continue
            graph.Add_und_edge(actor_1, actor_2, movie_name)

In [15]:
# Build the graph for the smaller credits file; later cells query it by name.
small_actor_graph = graph()
small_file_path = "smaller_imdb_cleaned.txt"
# Load errors are printed by the helper, not raised, so check the cell output.
process_credits_file_to_graph(small_actor_graph, small_file_path)

In [16]:
# Build the graph for the full credits file; later cells query it by name.
large_actor_graph = graph()
large_actor_file = "all_imdb_cleaned.txt"
# Load errors are printed by the helper, not raised, so check the cell output.
process_credits_file_to_graph(large_actor_graph, large_actor_file)

In [17]:
def format_output(merged_result:list):
    """Render an alternating actor/movie list as one path string.

    Items at even positions are emitted unchanged; items at odd positions are
    wrapped as ``-(item)-``. An empty list produces ``"No path found"``.
    """
    if not merged_result:
        return "No path found"

    pieces = [
        f"-({item})-" if position % 2 == 1 else item
        for position, item in enumerate(merged_result)
    ]
    return "".join(pieces)

def reconstruct_path(adj, current, graph):
    """Build the printable path that ends at ``current``.

    ``adj`` is followed as a predecessor map: starting at ``current`` the
    chain is walked until a ``None`` entry (or a key missing from ``adj``) is
    reached. The label for each hop is read from
    ``graph.adj_list[a][b]`` and the interleaved actor/label sequence is
    handed to ``format_output``.
    """
    # Collect the chain end-to-start, then flip it into start-to-end order.
    reversed_chain = []
    node = current
    while node is not None and node in adj:
        reversed_chain.append(node)
        node = adj[node]
    actors = list(reversed(reversed_chain))

    # Look up the edge label joining each consecutive pair of actors.
    movies = [graph.adj_list[left][right] for left, right in zip(actors, actors[1:])]

    # Interleave actor, movie, actor, ... and close with the final actor.
    merged_result = []
    for actor, movie in zip(actors, movies):
        merged_result.append(actor)
        merged_result.append(movie)
    if actors:
        merged_result.append(actors[-1])

    return format_output(merged_result)

**Vanilla BFS**

In [18]:
def bfs(start: str, end: str, graph: "graph"):
    """Find a shortest path from ``start`` to ``end`` with breadth-first search.

    Args:
        start: name of the source actor.
        end: name of the destination actor.
        graph: object exposing ``vertices`` (iterable of actor names) and
            ``adj_list`` (mapping actor -> neighbors).

    Returns:
        The formatted path from ``reconstruct_path`` when ``end`` is reached,
        ``"Not present"`` when either actor is missing from the graph, or
        ``"No Path Found"`` when both exist but are not connected.
    """
    if start not in graph.vertices or end not in graph.vertices:
        return "Not present"

    visited = {actor: False for actor in graph.vertices}
    pred = {actor: None for actor in graph.vertices}

    visited[start] = True
    queue = deque([start])
    while queue:
        current = queue.popleft()
        if current == end:
            return reconstruct_path(adj=pred, current=current, graph=graph)
        for neighbor in graph.adj_list[current]:
            if not visited[neighbor]:
                visited[neighbor] = True
                # Remember how we got here so the path can be rebuilt later.
                pred[neighbor] = current
                queue.append(neighbor)

    # Both actors exist but the search ran out of reachable vertices, so this
    # is a connectivity failure, not a missing actor.
    return "No Path Found"
     

**Bi-Directional BFS**

In [19]:
# Referenced https://www.geeksforgeeks.org/bidirectional-search/

def reconstruct_bidi_path(start_pred, end_pred, start,end, mid, graph):
    """Stitch the two halves of a bidirectional search into one printable path.

    The first half is obtained by following ``start_pred`` from ``mid`` until a
    ``None`` entry and reversing it; the second half by following ``end_pred``
    from ``end_pred[mid]`` until a ``None`` entry. ``start`` and ``end`` are
    accepted but not read here. The hop labels come from
    ``graph.adj_list[a][b]`` and the result is formatted by ``format_output``.
    """
    # Half one: mid back towards the forward root, then flipped.
    forward_half = []
    node = mid
    while node is not None and node in start_pred:
        forward_half.append(node)
        node = start_pred[node]
    forward_half.reverse()

    # Half two: everything after mid, following the backward predecessors.
    backward_half = []
    node = end_pred[mid]
    while node is not None and node in end_pred:
        backward_half.append(node)
        node = end_pred[node]

    actors = forward_half + backward_half
    movies = [graph.adj_list[left][right] for left, right in zip(actors, actors[1:])]

    # Interleave actor, movie, actor, ... and close with the final actor.
    merged_result = []
    for actor, movie in zip(actors, movies):
        merged_result.append(actor)
        merged_result.append(movie)
    merged_result.append(actors[-1])

    return format_output(merged_result)

def _expand_level(queue, visited, pred, other_visited, graph):
    """Expand one whole BFS level of ``queue``; return a meeting vertex or None.

    Every vertex that was in ``queue`` on entry is popped and its unvisited
    neighbors are marked and recorded in ``pred``. The first newly discovered
    neighbor that the opposite search has already visited is returned.
    """
    for _ in range(len(queue)):
        current = queue.popleft()
        for neighbor in graph.adj_list[current]:
            if visited[neighbor]:
                continue
            visited[neighbor] = True
            pred[neighbor] = current
            if other_visited[neighbor]:
                return neighbor
            queue.append(neighbor)
    return None


def BiDi_BFS(start: str, end: str, graph: "graph", bool=None):
    """Find a shortest path from ``start`` to ``end`` with bidirectional BFS.

    Args:
        start: name of the source actor.
        end: name of the destination actor.
        graph: object exposing ``vertices`` (iterable of actor names) and
            ``adj_list`` (mapping actor -> neighbors).
        bool: unused; kept (now optional) so existing positional or keyword
            callers keep working.

    Returns:
        The formatted path from ``reconstruct_bidi_path`` when the two
        searches meet, ``start`` itself when ``start == end``,
        ``"Not present"`` when either actor is missing from the graph, or
        ``"No Path Found"`` when the actors are not connected.
    """
    if start not in graph.vertices or end not in graph.vertices:
        return "Not present"

    # Both searches begin on the same vertex: the path is that single actor.
    # Without this guard the loop below would report a round trip through a
    # neighbor (or no path at all for an isolated actor).
    if start == end:
        return start

    start_queue = deque([start])
    end_queue = deque([end])

    # Per-direction visited flags, all initially False.
    start_visited = {actor: False for actor in graph.vertices}
    end_visited = {actor: False for actor in graph.vertices}

    # Per-direction predecessor maps used to rebuild the path.
    start_pred = {actor: None for actor in graph.vertices}
    end_pred = {actor: None for actor in graph.vertices}

    start_visited[start] = True
    end_visited[end] = True

    # Alternate whole levels rather than single vertices. Stopping at the
    # first meeting after expanding only one vertex per side can return a path
    # one edge longer than the shortest; finishing a level before the other
    # side moves makes the first meeting optimal.
    while start_queue and end_queue:
        # Forward BFS
        mid = _expand_level(start_queue, start_visited, start_pred, end_visited, graph)
        if mid is not None:
            return reconstruct_bidi_path(start_pred, end_pred, start, end, mid, graph)

        # Backward BFS
        mid = _expand_level(end_queue, end_visited, end_pred, start_visited, graph)
        if mid is not None:
            return reconstruct_bidi_path(start_pred, end_pred, start, end, mid, graph)

    return "No Path Found"

## What You Need to Find

This section outlines the specific questions you need to answer. Ensure each question is addressed in a separate code block, with the output clearly presented as either text or visual plots.



1. **Shortest Paths**: For each credits file, find the shortest paths for all the pairs provided in the example input files. Handle corner cases such as:
   - An actor not being present in the graph.
   - The source and destination actors not being connected.  
   Output appropriate messages for these cases.



In [None]:
input_file = "more-input.txt"
output_file = "more-output.txt"

# Open both files in a `with` block so the handles are closed when the cell
# finishes, even if a query raises.
with open(input_file, 'r', encoding='utf-8') as input_parameters, \
        open(output_file, 'r', encoding='utf-8') as expected_output:
    print("BFS Output\n")
    # Lines are paired with the expected-output file, so iteration stops at
    # the shorter of the two; `out` itself is not used below.
    for param, out in zip(input_parameters, expected_output):
        striped_param = param.strip().split()
        if len(striped_param) < 2:
            # Blank or incomplete line: there is no (start, end) pair to query.
            continue
        start, end = striped_param[0], striped_param[1]
        path = bfs(start=start, end=end, graph=small_actor_graph)
        print(f"small actor graph: {path}\n")

In [None]:
input_file = "more-input.txt"
output_file = "more-output.txt"

# Open both files in a `with` block so the handles are closed when the cell
# finishes, even if a query raises.
with open(input_file, 'r', encoding='utf-8') as input_parameters, \
        open(output_file, 'r', encoding='utf-8') as expected_output:
    print("Bi-Directional BFS Output\n")
    # Lines are paired with the expected-output file, so iteration stops at
    # the shorter of the two; `out` itself is not used below.
    for param, out in zip(input_parameters, expected_output):
        striped_param = param.strip().split()
        if len(striped_param) < 2:
            # Blank or incomplete line: there is no (start, end) pair to query.
            continue
        start, end = striped_param[0], striped_param[1]
        # BiDi_BFS declares a fourth parameter named `bool` (its body never
        # reads it); pass it explicitly so the call matches the signature.
        path = BiDi_BFS(start=start, end=end, graph=small_actor_graph, bool=False)
        print(f"{path}\n")
    

2. **Interesting Paths**: Identify and include some interesting shortest paths discovered in either credits file.


In [None]:
# Small graph: placeholder for the "interesting paths" exploration.
# NOTE(review): `co_star` is rebuilt on every iteration as a list of every
# vertex and is never read afterwards, so this cell produces no output yet.
# NOTE(review): `vertices` is called as a method here, while bfs and BiDi_BFS
# read `graph.vertices` as a plain attribute — confirm which form graph_tools
# actually provides.
for actor in small_actor_graph.vertices():
    co_star = [a for a in small_actor_graph.vertices() ]


3. **Path Differences**: Analyze how the shortest paths differ between the same vertex pairs across both credits files. Highlight cases where one file leads to shorter paths. Use random pairs to demonstrate the differences.


In [None]:
input_file = "more-input.txt"
output_file = "more-output.txt"

# Open both files in a `with` block so the handles are closed when the cell
# finishes, even if a query raises.
with open(input_file, 'r', encoding='utf-8') as input_parameters, \
        open(output_file, 'r', encoding='utf-8') as expected_output:
    print("BFS Output\n")
    # Lines are paired with the expected-output file, so iteration stops at
    # the shorter of the two; `out` itself is not used below.
    for param, out in zip(input_parameters, expected_output):
        striped_param = param.strip().split()
        if len(striped_param) < 2:
            # Blank or incomplete line: there is no (start, end) pair to query.
            continue
        start, end = striped_param[0], striped_param[1]

        # Run the same pair against both graphs so the paths can be compared.
        path = bfs(start=start, end=end, graph=small_actor_graph)
        print(f"small actor graph: {path}\n")

        path = bfs(start=start, end=end, graph=large_actor_graph)
        print(f"large actor graph: {path}\n")


4. **Kevin Bacon's Level Sets**: For Kevin Bacon or other notable starting actors, calculate the sizes of the level sets (sets of vertices at distance 1, 2, 3, etc.). Present the results in a plot.



5. **Connected Components**: Determine the sizes of the connected components in each graph.



6. **Edge Count Comparison**: Compare the number of edges traversed between vanilla BFS and Bidirectional BFS (Bi-BFS) for the pairs in the example input files. Include your own generated examples in this comparison.



7. **Wall-Clock Time**: Measure and compare the actual running times of BFS and Bi-BFS using the `timeit` module in Python. Provide wall-clock time data for the pairs in the example input files, as well as your own examples.
