In [2]:
# Preprocessing
# Lower-case English stopwords. Built once as a frozenset so each membership
# test is a hash lookup rather than a scan of a list rebuilt on every call.
_STOPWORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
    "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
    "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now",
])


def preprocess(text):
    """Split ``text`` into sentences and strip stopwords from each one.

    A sentence boundary is a whitespace character that follows ``.`` or ``?``,
    unless the text just before it looks like a dotted abbreviation
    (word char, dot, word char, any char) or a capitalised two-letter title
    followed by a dot (e.g. ``Dr.``).

    Stopword matching is case-insensitive but is done on whole
    whitespace-delimited tokens, so a stopword with punctuation attached
    (e.g. ``"it."``) is kept.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of strings, one per sentence that still contains at least one
        token after stopword removal. Fragments that are empty (for example
        the one produced by whitespace after the final full stop) are dropped
        rather than returned as ``""``.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    preprocessed_sentences = []
    for sentence in sentences:
        kept_words = [word for word in sentence.split() if word.lower() not in _STOPWORDS]
        # Skip fragments with nothing left: they carry no content and would
        # otherwise show up downstream as empty "sentences".
        if kept_words:
            preprocessed_sentences.append(" ".join(kept_words))

    return preprocessed_sentences


In [3]:
def extract_features(sentences, keyword="keyword"):
    """Compute a fixed-length list of surface features for each sentence.

    Args:
        sentences: Iterable of sentence strings.
        keyword: Substring searched for (case-insensitively) to set the
            ``contains_keyword`` flag. Defaults to the literal ``"keyword"``.

    Returns:
        A list with one 10-element list per sentence, in this order:
        ``[sentence_length, num_words, avg_word_length, num_characters,
        num_digits, num_special_chars, contains_keyword, num_uppercase,
        num_lowercase, num_punctuation]``.

        ``sentence_length`` counts whitespace-delimited tokens, while
        ``num_words`` counts ``\\w+`` runs. ``num_special_chars`` and
        ``num_punctuation`` are both the count of characters that are neither
        word characters nor whitespace.
    """
    keyword_lower = keyword.lower()
    features = []
    for sentence in sentences:
        words = re.findall(r'\w+', sentence)
        num_words = len(words)

        # Average over the same tokens that num_words counts, so attached
        # punctuation no longer inflates the lengths being averaged.
        avg_word_length = sum(len(word) for word in words) / num_words if num_words else 0

        # Both the "special chars" and "punctuation" slots use this pattern,
        # so evaluate it once.
        num_non_word_chars = len(re.findall(r'[^\w\s]', sentence))

        features.append([
            len(sentence.split()),                   # sentence_length
            num_words,
            avg_word_length,
            len(sentence),                           # num_characters
            len(re.findall(r'\d', sentence)),        # num_digits
            num_non_word_chars,                      # num_special_chars
            int(keyword_lower in sentence.lower()),  # contains_keyword
            len(re.findall(r'[A-Z]', sentence)),     # num_uppercase
            len(re.findall(r'[a-z]', sentence)),     # num_lowercase
            num_non_word_chars,                      # num_punctuation
        ])

    return features

In [4]:
# Sentence Selection
def select_sentences(sentences, scores, num_sentences):
    """Return the ``num_sentences`` highest-scoring sentences, best first.

    Args:
        sentences: Sequence of sentence strings.
        scores: Sequence of numeric scores, aligned with ``sentences``.
        num_sentences: Maximum number of sentences to return. ``None`` returns
            every sentence; zero or a negative value returns an empty list.

    Returns:
        A list of sentences ordered by descending score. Sentences with equal
        scores keep their original relative order, so ties favour the earlier
        sentence.
    """
    # A negative count would otherwise be treated as "all but the last k"
    # by the slice below.
    if num_sentences is not None and num_sentences <= 0:
        return []

    scores = np.asarray(scores, dtype=float)
    # A stable sort on the negated scores gives descending order while
    # breaking ties by position; reversing an ascending argsort would
    # instead put later sentences first among equals.
    ranked_indices = np.argsort(-scores, kind="stable")[:num_sentences]
    return [sentences[i] for i in ranked_indices]


In [5]:
# Summary Generation
def generate_summary(sentences):
    """Concatenate the given sentences into one string, separated by single spaces."""
    separator = ' '
    return separator.join(sentences)

In [6]:
# GUI Functions
def summarize_text():
    """Summarize the text in the input box and display it in the output box.

    Reads the requested sentence count from ``num_summary_sentences_entry``,
    splits and stopword-filters the input with ``preprocess``, scores every
    sentence by the sum of its TF-IDF term weights, and writes the
    highest-scoring sentences to ``output_text_box``.

    The summary is assembled from the stopword-filtered sentences, because
    that is what ``preprocess`` returns.

    An invalid sentence count is reported with an error dialog and leaves the
    output box untouched. If the input contains no usable sentences, the
    output box is simply cleared.
    """
    text = input_text_box.get("1.0", tk.END)

    # Validate the count up front; otherwise a typo in the entry surfaces as
    # an uncaught exception inside the Tk callback.
    try:
        num_sentences = int(num_summary_sentences_entry.get())
    except ValueError:
        messagebox.showerror("Invalid input", "Number of sentences must be a whole number.")
        return
    if num_sentences < 1:
        messagebox.showerror("Invalid input", "Number of sentences must be at least 1.")
        return

    # Ignore blank fragments so they are never scored or selected.
    candidate_sentences = [sentence for sentence in preprocess(text) if sentence.strip()]

    output_text_box.delete("1.0", tk.END)
    if not candidate_sentences:
        return

    # Score each sentence by its total TF-IDF weight. The previous
    # zero-coefficient "regression" gave every sentence a score of 0, so the
    # selection did not depend on the text at all.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(candidate_sentences)
        sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
    except ValueError:
        # Raised when no sentence yields a token for the vectorizer's
        # vocabulary; fall back to equal scores so a summary is still shown.
        sentence_scores = np.zeros(len(candidate_sentences))

    summary_sentences = select_sentences(candidate_sentences, sentence_scores, num_sentences=num_sentences)
    summary = generate_summary(summary_sentences)

    output_text_box.insert(tk.END, summary)


In [7]:
def clear_text():
    """Remove all content from both the input and the output text box."""
    for text_box in (input_text_box, output_text_box):
        text_box.delete("1.0", tk.END)

In [8]:
def show_about():
    """Pop up an information dialog describing the application."""
    dialog_title = "About"
    dialog_message = "This is a text summarizer GUI created using Tkinter."
    messagebox.showinfo(dialog_title, dialog_message)


In [9]:
# Root Tk window; the widgets created in the following cells use it as their master.
window = tk.Tk()
window.wm_title("Text Summarizer")

''

In [10]:
# Give the root window a pink background.
window["background"] = "pink"


In [11]:
# Left-hand column (grid column 0): caption on row 0, scrollable input area on row 1.
input_label = tk.Label(master=window, text="Input Text")
input_label.grid(column=0, row=0, sticky=tk.W, padx=10, pady=5)

input_text_box = scrolledtext.ScrolledText(master=window, height=25, width=75)
input_text_box.grid(column=0, row=1, sticky=tk.W, padx=10, pady=5)


In [12]:
# Right-hand column (grid column 1): caption on row 0, scrollable summary area on row 1.
output_label = tk.Label(master=window, text="Summarized Text")
output_label.grid(column=1, row=0, sticky=tk.W, padx=10, pady=5)

output_text_box = scrolledtext.ScrolledText(master=window, height=25, width=75)
output_text_box.grid(column=1, row=1, sticky=tk.W, padx=10, pady=5)

In [13]:
# Row 2, column 0 holds both widgets: the caption sticks to the west edge
# of the cell and the entry to the east edge.
num_summary_sentences_label = tk.Label(master=window, text="Number of Sentences:")
num_summary_sentences_label.grid(column=0, row=2, sticky=tk.W, padx=10, pady=5)

num_summary_sentences_entry = tk.Entry(master=window, width=5)
num_summary_sentences_entry.grid(column=0, row=2, sticky=tk.E, padx=10, pady=5)


In [14]:
# Pre-fill the sentence-count entry with "3".
num_summary_sentences_entry.insert(index=tk.END, string="3")

In [15]:
# Row 2, column 1 holds both action buttons: "Summarize" sticks to the west
# edge of the cell and "Clear" to the east edge.
summarize_button = tk.Button(master=window, text="Summarize", command=summarize_text, width=10)
summarize_button.grid(column=1, row=2, sticky=tk.W, padx=10, pady=5)

clear_button = tk.Button(master=window, text="Clear", command=clear_text, width=10)
clear_button.grid(column=1, row=2, sticky=tk.E, padx=10, pady=5)

In [16]:
# Build a menu widget and install it as the root window's menu bar
# (no entries are added to it in this cell).
menu_bar = tk.Menu(master=window)
window["menu"] = menu_bar
