# Using BART to summarize README files from GitHub repos
Many LLMs are available, but I chose BART because it is well suited to the summarization task. It also needs no LLM API: just load the pretrained model and let the magic happen.

In [None]:
import os

import pandas as pd
import requests
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the BART model and tokenizer.
# Both are module-level globals that summarize_readme() reads directly.
# NOTE: the first from_pretrained() call downloads the checkpoint
# (the cell output shows a ~1.63 GB model.safetensors download).
model_name = "facebook/bart-large-cnn"  
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Load the metrics DataFrame.
# The CSV path is relative to the notebook's working directory; the frame
# must contain a 'url' column, which the summarization loop iterates over.
metrics_df = pd.read_csv('all_github_metrics_withclosedopen_issues_prs_usedby.csv')

# Function to fetch README file
def fetch_readme(url, github_token, timeout=30):
    """Download the raw README of a GitHub repository via the REST API.

    Args:
        url: Repository URL such as ``https://github.com/<owner>/<repo>``.
            A trailing slash and a ``.git`` suffix are tolerated.
        github_token: GitHub token sent in the ``Authorization`` header.
            When it is empty or None the request is sent without that header.
        timeout: Seconds to wait for the GitHub API before giving up, so a
            stalled connection cannot hang the whole batch run.

    Returns:
        The README text, or ``None`` when the URL is unusable or the request
        fails (for example a 404 for a repository without a README). The
        failure reason is printed; no exception is raised to the caller.
    """
    try:
        # Drop empty segments so a trailing slash does not shift owner/repo.
        parts = [segment for segment in url.strip().split('/') if segment]
        if len(parts) < 2:
            raise ValueError("URL does not contain an owner and a repository name")
        owner, repo = parts[-2], parts[-1]

        # Clone-style URLs end in ".git", which is not part of the repo name.
        if repo.endswith('.git'):
            repo = repo[:-len('.git')]

        # GitHub API URL for the README; the "raw" media type returns the
        # file content itself instead of a JSON envelope.
        api_url = f'https://api.github.com/repos/{owner}/{repo}/readme'
        headers = {'Accept': 'application/vnd.github.v3.raw'}
        if github_token:
            headers['Authorization'] = f'token {github_token}'

        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        return response.text  # Return the README content
    except Exception as e:
        # Deliberately best-effort: one bad repository must not abort the
        # whole batch, so report the problem and signal "no README".
        print(f"Error fetching README for {url}: {e}")
        return None

# Function to summarize README using BART
def summarize_readme(readme_content):
    """Summarize README text with the module-level BART model.

    Args:
        readme_content: README text, or None when no README was fetched.

    Returns:
        The generated summary string, or "No content to summarize" when the
        input is missing, not a string, or contains only whitespace.
    """
    # Whitespace-only text is truthy but gives the model nothing to work with.
    if not isinstance(readme_content, str) or not readme_content.strip():
        return "No content to summarize"

    # BART takes the raw document: a "summarize: " task prefix is a T5
    # convention and would only add noise to the input here.
    # Inputs longer than 1024 tokens are truncated.
    inputs = tokenizer(readme_content, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=50,
        min_length=25,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# GitHub token passed to fetch_readme(). It is read from the GITHUB_TOKEN
# environment variable so the secret is never saved inside the notebook.
github_token = os.environ.get('GITHUB_TOKEN', '')
if not github_token:
    print("GITHUB_TOKEN is not set; README requests will not be authenticated correctly.")




model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

                                            url  is_private  has_homepage  \
0           https://github.com/zloirock/core-js       False         False   
1  https://github.com/sindresorhus/p-cancelable       False         False   
2              https://github.com/fb55/entities       False         False   

    size  stars  watchers  has_projects  has_pages  has_wiki  has_discussions  \
0  62787  24762     24762         False       True     False             True   
1     59    441       441         False      False     False            False   
2   4549    331       331         False      False     False            False   

   ...  log_stars  log_watchers  log_forks  log_commit_count  \
0  ...  10.117065     10.117065   7.414573          6.234411   
1  ...   6.089045      6.089045   3.091042          6.536692   
2  ...   5.802118      5.802118   4.158883          6.802395   

   open_issues_count closed_issues_count open_prs_count  closed_prs_count  \
0                 33            

In [3]:

# Fetch and summarize the README of every repository in the 'url' column
summaries = [
    summarize_readme(fetch_readme(repo_url, github_token))
    for repo_url in metrics_df['url']
]

# Attach the summaries to metrics_df as a new 'summary' column
metrics_df['summary'] = summaries

# Write the enriched metrics to a new CSV file (row index omitted)
metrics_df.to_csv('metrics_with_summary.csv', index=False)

# Show the updated metrics DataFrame
print(metrics_df)


Error fetching README for https://github.com/prysmaticlabs/protoc-gen-go-cast: 404 Client Error: Not Found for url: https://api.github.com/repos/prysmaticlabs/protoc-gen-go-cast/readme
                                                   url  is_private  \
0                  https://github.com/zloirock/core-js       False   
1         https://github.com/sindresorhus/p-cancelable       False   
2                     https://github.com/fb55/entities       False   
3                  https://github.com/paradigmxyz/reth       False   
4                      https://github.com/yarnpkg/yarn       False   
..                                                 ...         ...   
161                   https://github.com/rollup/rollup       False   
162       https://github.com/ajv-validator/ajv-formats       False   
163  https://github.com/ethereumjs/ethereumjs-block...       False   
164                   https://github.com/facebook/fbjs       False   
165                  https://github.com/react

#### One repo, https://github.com/prysmaticlabs/protoc-gen-go-cast, doesn't have a README file (the API returns 404), so its summary is "No content to summarize"