# GPU Information

In [None]:
import tensorflow as tf
# Query TensorFlow for the GPU device name. This is the last expression of the
# cell, so whatever the call returns is shown as the cell's output.
tf.test.gpu_device_name()

In [None]:
# memory footprint support libraries/code

# Symlink /opt/bin/nvidia-smi to /usr/bin/nvidia-smi (-f replaces an existing link).
# NOTE(review): presumably needed so that GPUtil can find nvidia-smi on PATH — confirm.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# Install the three packages imported right after this; no versions are pinned here.
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()

# XXX: only one GPU on Colab and it isn't guaranteed, so an empty list must not
# crash the cell. `gpu` is None when the runtime has no GPU attached.
gpu = GPUs[0] if GPUs else None

def printm():
    """Print host RAM, this process's resident size and the first GPU's memory.

    The GPU list is fetched again through GPU.getGPUs() on every call instead
    of reusing the module-level `gpu` object: that object was created once when
    the cell ran, so reading it later would repeat the figures from that moment.
    When no GPU is reported, a placeholder line is printed instead of raising.

    Returns:
        None. Everything is written to stdout.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))

    current_gpus = GPU.getGPUs()
    if not current_gpus:
        print("GPU RAM Free: n/a (no GPU detected)")
        return

    # Same choice as the module-level `gpu`: report the first device only.
    current = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(current.memoryFree, current.memoryUsed, current.memoryUtil*100, current.memoryTotal))

printm()

In [None]:
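# Emergency reset, disabled on purpose: uncommenting this sends SIGKILL to every
# process this user can signal, which includes the notebook kernel itself.
#!kill -9 -1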

# Data Warehousing Setup



In [None]:
# Mount Google Drive folder

from google.colab import drive
# Mount point is /content/drive; every Drive path in this notebook starts there.
# NOTE(review): force_remount=True presumably remounts even if Drive is already
# mounted — confirm against the google.colab documentation.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Switch to the Summarization project folder on the mounted Drive. The dataset
# and pytextrank paths used further down all live under this folder.

%cd  /content/drive/My Drive/Models Running/Summarization

# Text Rank Model


In [None]:
# Install the pinned package versions used by the Text Rank section.
# NOTE(review): presumably these are the versions the cloned pytextrank code
# was written against — confirm before upgrading any of them.
! pip install networkx==1.11
! pip install graphviz==0.7.1
! pip install -U spacy==1.10.1
#! pip install statistics==1.0.3.5
! pip install datasketch==1.2.1 -U
#! pip install matplotlib==2.1 # Include if graph needs to be produced

In [None]:
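# Disabled on purpose: uncomment to delete an existing ./pytextrank checkout
# before cloning the repository again.
#! rm -rf pytextrank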

In [None]:
# Clone pytextrank into the current working directory (creates ./pytextrank).
# NOTE(review): no tag or commit is pinned, so this fetches whatever the default
# branch currently contains. Later cells call ptr.parse_doc, ptr.text_rank,
# ptr.normalize_key_phrases and ptr.top_sentences — confirm the cloned revision
# still provides these functions.
! git clone https://github.com/DerwenAI/pytextrank.git

In [None]:
# Fetch spaCy's 'en' model through the spaCy command-line interface.
! python -m spacy download en

In [None]:
% cd /content/drive/My Drive/Models Running/Summarization/pytextrank/pytextrank

import pandas as pd
import json

import networkx as nx
import pylab as plt

import pytextrank as ptr

import warnings
warnings.filterwarnings("ignore")

genre = pd.read_csv('/content/drive/My Drive/Models Running/Summarization/datasets/genre_final_for_summarization.tsv', sep='\t')

In [None]:
# Drop the columns the summarizer does not need and index the frame by
# 'genre_encoded'. The guard makes the cell safe to run twice: once the index
# is already 'genre_encoded' the listed columns no longer exist, and an
# unguarded drop would raise KeyError. Building the result in one chained
# expression also leaves `genre` untouched if either step fails.
if genre.index.name != 'genre_encoded':
    genre = (
        genre
        .drop(columns=['song', 'artist', 'genre', 'lyrics_nchar'])
        .set_index('genre_encoded')
    )

In [None]:
def create_summary(line, work_dir='/content/drive/My Drive/Models Running/Summarization/pytextrank/dat', word_limit=150):
    """Summarize one DataFrame row with the staged pytextrank pipeline.

    Each stage writes a JSON-lines file into `work_dir`, and the next stage
    reads that file back. The file names are fixed, so every call overwrites
    the files left by the previous call.

    Args:
        line: A row as passed by ``DataFrame.apply(..., axis=1)``. ``line.name``
            becomes the document id and ``line.values[0]`` the document text.
        work_dir: Directory for the intermediate files. Defaults to the
            location that used to be hard-coded, so existing callers see the
            same files in the same place.
        word_limit: Forwarded to ``ptr.limit_sentences`` as ``word_limit``.

    Returns:
        str: The selected sentences, ordered by the second element of each
        ``(sentence, index)`` pair yielded by ``ptr.limit_sentences`` and
        joined with single spaces.
    """
    import os  # local import so the function does not rely on another cell having imported os

    path_stage0 = os.path.join(work_dir, 'temp.json')
    path_stage1 = os.path.join(work_dir, 'pytextranko1.json')
    path_stage2 = os.path.join(work_dir, 'pytextranko2.json')
    path_stage3 = os.path.join(work_dir, 'pytextranko3.json')

    # Stage 0 - Dump the row as a {'id', 'text'} JSON document for pytextrank to read
    with open(path_stage0, 'w') as outfile:
        json.dump({'id': line.name, 'text': line.values[0]}, outfile)

    # Stage 1 - Perform statistical parsing/tagging on a document in JSON format
    with open(path_stage1, 'w') as f:
        for graf in ptr.parse_doc(ptr.json_iter(path_stage0)):
            f.write("%s\n" % ptr.pretty_print(graf._asdict()))

    # Stage 2 - Collect and normalize the key phrases from a parsed document
    graph, ranks = ptr.text_rank(path_stage1)
    ptr.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in ptr.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % ptr.pretty_print(rl._asdict()))

    # To inspect the graph visually: nx.draw(graph, with_labels=True); plt.show()

    # Stage 3 - Calculate a significance weight for each sentence, using MinHash
    # to approximate a Jaccard distance from key phrases determined by TextRank
    kernel = ptr.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in ptr.top_sentences(kernel, path_stage1):
            f.write(ptr.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4 - Summarize a document based on its most significant sentences.
    # Sorting by the pair's second element puts the sentences in index order.
    ordered = sorted(
        ptr.limit_sentences(path_stage3, word_limit=word_limit),
        key=lambda pair: pair[1],
    )
    return " ".join(ptr.make_sentence(sent_text) for sent_text, _ in ordered)

In [None]:
# Announce the run, then summarize every row of `genre`: with axis=1 each row
# is handed to create_summary as a Series.
print("Starting with Genre...\n\n")
genre_textrank = genre.apply(create_summary, axis=1)

In [None]:
# Keep only the summaries that are at least 200 characters long.
long_enough = genre_textrank.str.len() >= 200
genre_textrank = genre_textrank[long_enough]

# Write the surviving summaries as TSV, with no header row and no index label.
genre_textrank.to_csv(
    '/content/drive/My Drive/Models Running/Summarization/datasets/text_rank/genre.tsv',
    sep='\t',
    header=False,
    index_label=False,
)