In [54]:
import chardet
import os
import pandas as pd

In [55]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read()

    result = chardet.detect(raw_data)
    return result['encoding']

def srt_to_text(srt_file_path):
    encoding_attempts = ['utf-8', 'ISO-8859-1', 'utf-16', ]

    for encoding in encoding_attempts:
        try:
            with open(srt_file_path, 'r', encoding=encoding) as file:
                lines = file.readlines()

            text = ""
            current_line = ""
            is_time_line = True

            for line in lines:
                line = line.strip()

                if not line:
                    is_time_line = True
                    continue

                if is_time_line:
                    is_time_line = False
                    continue

                current_line += line + " "

                # If the line ends with a punctuation indicating the end of a sentence
                if line.endswith(('.', '!', '?')):
                    text += current_line + "\n"
                    current_line = ""

            return text.strip()
        except UnicodeDecodeError:
            None

    # If none of the encodings work
    print(f"Unable to decode file: {srt_file_path}")
    return None

def process_srt_files_in_folder(folder_path):
    result_dict = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".srt"):
            srt_file_path = os.path.join(folder_path, filename)
            text_content = srt_to_text(srt_file_path)

            if text_content is not None:
                file_dict = {"name": filename, "text": text_content}
                result_dict.append(file_dict)

    return result_dict

# Example usage for a folder containing SRT files
folder_path = '/Users/wally/Library/CloudStorage/OneDrive-Personal/Documents/Cornell/1. Fourth Year/INFO 6350 - Text Mining History and Literature/INFO6350_Final_Project/srt_files'
result_dictionary = process_srt_files_in_folder(folder_path)

In [56]:
movie_subtitles_df = pd.DataFrame(result_dictionary)

In [57]:
# Removal of timestamps
movie_subtitles_df['text'] = movie_subtitles_df['text'].str.replace(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}[\s-]+', '\n', regex=True)

In [60]:
movie_subtitles_df

Unnamed: 0,name,text
0,Sunset Blvd. (1950)_english.srt,"\nYes, this is Sunset Boulevard, Los Angeles, ..."
1,Diner.1982.720p.BluRay.x264-AMIABLE.srt,\nA little bit softer now - Shout \nA little b...
2,Tender Mercies (1983).srt,"\nHere, give me the bottle. \n- Go to hell. \n..."
3,Ghandi.1982.1080p.BluRay.x264.YIFY.srt,\nThere are more than yesterday. \n\nHe will b...
4,Ace in the hole (1951) Eng.srt,\nHey. Pull up at the corner. \n\nWait here. \...
5,David and Bathsheba 1951 English.srt,\nIRA: Wait here. \n\nLord Commander. \n\nWell...
6,Silkwood.1983.BluRay.720p.1.1GB.x264.FunCinema...,\nName? \n- Karen Silkwood. \n\nDrew Stephens....
7,Passport To Pimlico (1949).srt,\n'You've been listening to a programme of lun...
8,An American in Paris (1951) Eng.srt,\nThis is Paris. \n\nAnd I'm an American who l...
9,Beverly.Hills.Cop.1984.REMASTERED.720p.BluRay....,\n[♪ Glenn Frey: <i>The Heat Is On</i>] \nTruc...
