In [1]:
from __future__ import print_function
from __future__ import division
import os
import sys

# Edit this to be the full path to the extracted Spark directory on your computer
spark_home = 'C:\Users\Vikaasa\IdeaProjects\VIT\spark-1.6.1-bin-hadoop2.6'
os.environ['SPARK_HOME'] = spark_home
sys.path.insert(0, spark_home + "/python")
sys.path.insert(0, os.path.join(spark_home, 'python\lib\py4j-0.9-src.zip'))

In [2]:
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
import json
import pandas as pd
import re
import nltk
from nltk.util import bigrams, trigrams
from nltk.util import ngrams
from nltk.corpus import stopwords
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector, DenseVector

df = pd.read_csv('speeches_dataset.csv')
df.to_json("speeches_dataset.json",orient='records')

In [4]:
df = sqlContext.read.json('speeches_dataset.json')
df_fillna=df.fillna("")
print(df_fillna.count())
print(df_fillna.printSchema())

181
root
 |-- speaker: string (nullable = false)
 |-- speech_id: long (nullable = true)
 |-- text: string (nullable = false)

None


In [5]:
df.take(1)

[Row(speaker=u'Obama', speech_id=1, text=u"The President. Hello, Parma! Hello, Ohio! Well, it is good to be in Parma. Everybody, have a seat who can sit down. [Laughter] It is good to be here. And this now, this is the right time for a rally. Now things have cooled off a little bit. And I had a beer in Amherst, at Ziggy's, so I'm feeling good, feeling steady.\nI want to first of all I want everybody to give Wendy a big round of applause for the introduction. We've got some outstanding Ohioans in the house. First of all, the mayor of Parma, Tim DeGeeter is here. Stand up, Tim, so everybody can see you. There you go. I don't know who the guy holding the mayor is, but [laughter] nah.\n\nAudience member. That's Jack!\n\nThe President. Oh, that's Jack! [Laughter] Got it.\n\nYou guys have some of the best Members of Congress from this area. And nobody is fighting harder than your outstanding Senator, Sherrod Brown. And his outstanding wife Connie, who I love. We love Connie. Although, Connie

In [6]:
speech_stopwords_list = list([line.strip() for line in open('speech_stopwords.txt', 'r')])
speech_stopwords_broadcasted = sc.broadcast(speech_stopwords_list)
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords_broadcasted = sc.broadcast(nltk_stopwords)
more_stopwords = set([line.strip() for line in open('more_stopwords.txt', 'r')])
more_stopwords_broadcasted = sc.broadcast(more_stopwords)
ngram_value = 2
ngram_broadcasted = sc.broadcast(ngram_value)

In [7]:
def clean_up(s):
    text_removing_brackets = re.sub("[\(\[].*?[\)\]]", "", s)
    text_removing_double_quotes = re.sub('"',"",text_removing_brackets)
    speech_stopwords = speech_stopwords_broadcasted.value
    text_removing_stopwords = text_removing_double_quotes
    for token in speech_stopwords:
        text_removing_stopwords = re.sub(token,'',text_removing_stopwords)
    return text_removing_stopwords

def unicode_encode(s):
    return s.encode('utf-8','ignore')

def para_segmenter_and_cleanup(s):
    delimiters = "\n"
    delimiter_list='|'.join(map(re.escape, delimiters))
    paras=re.split(delimiter_list, s)
    paras_cleaned = [clean_up(sentence) for sentence in paras]
    paras_remove_other_speakers = [remove_other_speakers(sentence) for sentence in paras_cleaned]
    filtered_paras = filter(None, paras_remove_other_speakers)
    #sentences=nltk.sent_tokenize(s)
    #sentences = s.split(":")
    #return [w2v_cleanup(item) for item in sentences]
    return "\n".join(filtered_paras)

def check_stopword(tok,stopwords):
    if tok in stopwords:
        return "#"
    else:
        return tok
    
def tokenizer_and_stopword_remover(s):
    stopwords = nltk_stopwords_broadcasted.value
    token_list = s.lower().split()
    token_string = [check_stopword(x,stopwords) for x in token_list]
    #exclude_stopwords = lambda token : token not in stopwords
    token_string = " ".join(token_string)
    tokens = token_string.split("# ")
    return tokens

def remove_other_speakers(s):
    tokens = s.split()
    if len(tokens) > 1:
        if(':' in tokens[0] or '.' in tokens[0] or ':' in tokens[1] or '.' in tokens[1]):
            return None
        else:
            return s
    else:
        return None
    
def para_segmenter(s):
    delimiters = "\n"
    delimiter_list='|'.join(map(re.escape, delimiters))
    paras=re.split(delimiter_list, s)
    filtered_paras = filter(None, paras)
    #sentences=nltk.sent_tokenize(s)
    #sentences = s.split(":")
    #return [w2v_cleanup(item) for item in sentences]
    return filtered_paras

def speech_vocabulary(s):
    tokens=s.split()
    speech_len=len(tokens)
    #print(speech_len)
    vocab_len = (len(list(set(tokens))))
    #print(vocab_len)
    score=float(vocab_len/speech_len)
    #print(score)
    return score

def return_n_grams(s, n):
    #n=ngram_broadcasted.value
    print(n)
    ngrams_list=[]
    stopwords_removed = tokenizer_and_stopword_remover(s)
    #stopwords_removed=s
    for phrase in stopwords_removed:
        sixgrams = ngrams(phrase.split(), n)
        for grams in sixgrams:
            gram_phrase=""
            flag=0
            for tok in grams:
                if flag==0 and ',' not in tok and '.' not in tok and ';' not in tok and '!' not in tok:
                    gram_phrase = tok
                    flag=flag+1
                elif flag>0 and flag<n-1 and ',' not in tok and '.' not in tok and ';' not in tok and '!' not in tok:
                    gram_phrase = gram_phrase+" "+tok
                elif flag == n-1:
                    gram_phrase = gram_phrase+" "+tok
                else:
                    gram_phrase = None
                    flag=None
            if(gram_phrase != None):
                ngrams_list.append(re.sub("[^\w\-' ]", '', gram_phrase))
    return ngrams_list

In [8]:
def call_utf_encoder(df):
    utf_encoder_udf=udf(unicode_encode, StringType())
    df_cleaned = df.withColumn('speech_text_utf', utf_encoder_udf(df['text'])).drop(df['text'])
    print(df_cleaned.printSchema())
    print(df_cleaned.show(10))
    print(df_cleaned.count())
    return df_cleaned

def call_para_cleanup(df):
    para_cleanup_udf=udf(para_segmenter_and_cleanup, StringType())
    df_cleaned = df.withColumn('para_cleaned_text', para_cleanup_udf(df['speech_text_utf'])).drop(df['speech_text_utf'])
    print(df_cleaned.printSchema())
    print(df_cleaned.show(10))
    print(df_cleaned.count())
    return df_cleaned

def call_ngrams(df, n):
    ngrams_udf=udf(lambda tkn: return_n_grams(tkn,n),ArrayType(StringType()))
    #ngrams_udf=udf(return_n_grams(n), ArrayType(StringType()))
    col_label=str(n)+"grams"
    print(col_label)
    ngram_value = n
    ngram_broadcasted = sc.broadcast(n)
    df_with_ngrams = df.withColumn(col_label, ngrams_udf(df['para_cleaned_text']))
    print(df_with_ngrams.printSchema())
    print(df_with_ngrams.select(col_label).show(3))
    print(df_with_ngrams.count())
    return df_with_ngrams

def call_speech_vocab(df):
    para_cleanup_udf=udf(speech_vocabulary, FloatType())
    df_with_vocab_score = df.withColumn('vocab_score', para_cleanup_udf(df['para_cleaned_text']))
    print(df_with_vocab_score.printSchema())
    print(df_with_vocab_score.show(3))
    print(df_with_vocab_score.count())
    return df_with_vocab_score


In [9]:
df_utf=call_utf_encoder(df)


root
 |-- speaker: string (nullable = true)
 |-- speech_id: long (nullable = true)
 |-- speech_text_utf: string (nullable = true)

None
+-------+---------+--------------------+
|speaker|speech_id|     speech_text_utf|
+-------+---------+--------------------+
|  Obama|        1|The President. He...|
|  Obama|        2|The President. He...|
|  Obama|        3|The President. He...|
|  Obama|        4|The President. He...|
|  Obama|        5|The President. He...|
|  Obama|        6|The President. He...|
|  Obama|        7|The President. Ho...|
|  Obama|        8|The President. He...|
|  Obama|        9|The President. He...|
|  Obama|       10|The President. He...|
+-------+---------+--------------------+
only showing top 10 rows

None
181


In [10]:
df_cleaned=call_para_cleanup(df_utf)

root
 |-- speaker: string (nullable = true)
 |-- speech_id: long (nullable = true)
 |-- para_cleaned_text: string (nullable = true)

None
+-------+---------+--------------------+
|speaker|speech_id|   para_cleaned_text|
+-------+---------+--------------------+
|  Obama|        1| Hello, Parma! He...|
|  Obama|        2| Hello, Sandusky!...|
|  Obama|        3| Hey! Hello, Ohio...|
|  Obama|        4| Hello, Pittsburg...|
|  Obama|        5| Hello, Ohio! How...|
|  Obama|        6| Hello, Cedar Rap...|
|  Obama|        7| How's it going, ...|
|  Obama|        8| Hello, Hampton! ...|
|  Obama|        9| Hello, Roanoke! ...|
|  Obama|       10| Hello, Virginia ...|
+-------+---------+--------------------+
only showing top 10 rows

None
181


In [11]:
print(df_cleaned)
df_with_bigrams = call_ngrams(df_cleaned, 2)
df_with_trigrams = call_ngrams(df_with_bigrams, 3)
df_with_4grams = call_ngrams(df_with_trigrams, 4)
df_with_5grams = call_ngrams(df_with_4grams, 4)
df_with_6grams = call_ngrams(df_with_5grams, 4)
df_with_vocab_score = call_speech_vocab(df_with_6grams)

DataFrame[speaker: string, speech_id: bigint, para_cleaned_text: string]
2grams
root
 |-- speaker: string (nullable = true)
 |-- speech_id: long (nullable = true)
 |-- para_cleaned_text: string (nullable = true)
 |-- 2grams: array (nullable = true)
 |    |-- element: string (containsNull = true)

None
+--------------------+
|              2grams|
+--------------------+
|[sit down, right ...|
|[happy fourth, or...|
|[everybody who's,...|
+--------------------+
only showing top 3 rows

None
181
3grams
root
 |-- speaker: string (nullable = true)
 |-- speech_id: long (nullable = true)
 |-- para_cleaned_text: string (nullable = true)
 |-- 2grams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 3grams: array (nullable = true)
 |    |-- element: string (containsNull = true)

None
+--------------------+
|              3grams|
+--------------------+
|[ohio's middle cl...|
|[grilling here hu...|
|[everybody who's ...|
+--------------------+
only showing top 3 rows

N

In [12]:
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    %time rescaled_data.count()
    return rescaled_data


In [13]:
df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')

+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+
|speaker|speech_id|   para_cleaned_text|              2grams|              3grams|              4grams|vocab_score|  2gramsraw_features|      2gramsfeatures|
+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+
|  Obama|        1| Hello, Parma! He...|[sit down, right ...|[ohio's middle cl...|[ohio's middle cl...| 0.29231918|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|
|  Obama|        2| Hello, Sandusky!...|[happy fourth, or...|[grilling here hu...|[pretty good gril...| 0.29399884|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|
|  Obama|        3| Hey! Hello, Ohio...|[everybody who's,...|[everybody who's ...|[even though poli...| 0.31220722|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|
|  Obama|        4| Hello, Pittsburg...|[favorite pe

In [14]:
df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')


+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
|speaker|speech_id|   para_cleaned_text|              2grams|              3grams|              4grams|vocab_score|  2gramsraw_features|      2gramsfeatures|  3gramsraw_features|      3gramsfeatures|
+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
|  Obama|        1| Hello, Parma! He...|[sit down, right ...|[ohio's middle cl...|[ohio's middle cl...| 0.29231918|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,5,7...|(100,[0,1,2,3,5,7...|
|  Obama|        2| Hello, Sandusky!...|[happy fourth, or...|[grilling here hu...|[pretty good gril...| 0.29399884|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,7...|(100,[0,1,2,3,4,7...|


In [16]:
df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')


+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|speaker|speech_id|   para_cleaned_text|              2grams|              3grams|              4grams|vocab_score|  2gramsraw_features|      2gramsfeatures|  3gramsraw_features|      3gramsfeatures|  4gramsraw_features|      4gramsfeatures|
+-------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  Obama|        1| Hello, Parma! He...|[sit down, right ...|[ohio's middle cl...|[ohio's middle cl...| 0.29231918|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,4,5...|(100,[0,1,2,3,5,7...|(100,[0,1,2,3,5,7...|(100,[5,7,10,11,1...|(100,[5,7,10,11,1...|
|  Obama|        2| Hello, Sandu

In [17]:
assembler = VectorAssembler(
    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
    outputCol="features")
assembler_output = assembler.transform(df_with_4grams_idf_vectors)
output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
print(output.show())
%time print(output.count())

+-------+---------+--------------------+--------------------+
|speaker|speech_id|   para_cleaned_text|            features|
+-------+---------+--------------------+--------------------+
|  Obama|        1| Hello, Parma! He...|[0.23465884427050...|
|  Obama|        2| Hello, Sandusky!...|[0.10056807611593...|
|  Obama|        3| Hey! Hello, Ohio...|[0.40227230446372...|
|  Obama|        4| Hello, Pittsburg...|[0.03352269203864...|
|  Obama|        5| Hello, Ohio! How...|[0.33522692038643...|
|  Obama|        6| Hello, Cedar Rap...|[0.33522692038643...|
|  Obama|        7| How's it going, ...|(301,[1,3,5,10,14...|
|  Obama|        8| Hello, Hampton! ...|[0.13409076815457...|
|  Obama|        9| Hello, Roanoke! ...|[0.23465884427050...|
|  Obama|       10| Hello, Virginia ...|[0.30170422834779...|
|  Obama|       11| Thank you! Hello...|[0.26818153630914...|
|  Obama|       12| How's it going, ...|[0.23465884427050...|
|  Obama|       13| Hello, Texas! It...|[0.06704538407728...|
|  Obama

In [21]:
output_tordd = output.rdd
train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
train_df = train_rdd.toDF()
test_df = test_rdd.toDF()
print(train_df)
print(test_df)

DataFrame[speaker: string, speech_id: bigint, para_cleaned_text: string, features: vector]
DataFrame[speaker: string, speech_id: bigint, para_cleaned_text: string, features: vector]


In [22]:
print('Train DF - Count: ')
print(train_df.count())
print('Test DF - Count: ')
print(test_df.count())

Train DF - Count: 
150
Test DF - Count: 
31


In [23]:
print("Initializing RF Model")
labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
pipeline = Pipeline(stages=[labelIndexer,rf])
%time model = pipeline.fit(output)
print("Completed RF Model")



Initializing RF Model
Wall time: 4min 27s
Completed RF Model


In [24]:
% time predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
rfModel = model.stages[1]
print(rfModel)  # summary only
print("Predictions: ")
print(predictions.show())

Wall time: 117 ms
Test Error = 0
RandomForestClassificationModel (uid=rfc_b3d29d5626ad) with 1000 trees
Predictions: 
+-------+---------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|speaker|speech_id|   para_cleaned_text|            features|indexedLabel|       rawPrediction|         probability|prediction|
+-------+---------+--------------------+--------------------+------------+--------------------+--------------------+----------+
|  Obama|       11| Thank you! Hello...|[0.26818153630914...|         0.0|[941.085759539758...|[0.94108575953975...|       0.0|
|  Obama|       16| Hello, everybody...|[0.20113615223186...|         0.0|[988.110050454478...|[0.98811005045447...|       0.0|
|  Obama|       19| How's it going, ...|[0.10056807611593...|         0.0|[940.087842322247...|[0.94008784232224...|       0.0|
|  Obama|       22| How's it going, ...|[0.23465884427050...|         0.0|[887.943736448105...|[0.88794373644810..

In [156]:
x=para_vocabulary(text)
x

0.28950485666903575

In [146]:
s=float(1222/4221)
s

0.0

In [303]:
ngram_value = 4
ngram_broadcasted = sc.broadcast(ngram_value)
ngram_broadcasted.unpersist


<bound method Broadcast.unpersist of <pyspark.broadcast.Broadcast object at 0x000000001FA18438>>

In [304]:
ngram_broadcasted.value

4

In [13]:
x=return_n_grams(text, 4)
x

4


["ohio's middle class every",
 'middle class every single',
 'class every single day',
 'every single day betty',
 'single day betty sutton',
 'state senate seat came',
 "lot like ohio it's",
 'first race way back',
 "seemed like wages weren't",
 "like wages weren't going",
 'basic bargain felt like',
 'auto industry almost went',
 "let's let detroit go",
 "we've seen manufacturing start",
 'pulling us back laughter',
 'two fundamentally different visions',
 'wall street banks anymore',
 "worst financial crisis we've",
 "let's stop giving tax",
 'stop giving tax breaks',
 "you've got health insurance",
 'got health insurance right',
 "insurance companies can't drop",
 "can't impose lifetime limits",
 'impose lifetime limits insurance',
 'lifetime limits insurance companies',
 "limits insurance companies can't",
 "insurance companies can't impose",
 "companies can't impose lifetime",
 "can't impose lifetime limits",
 "we're importing less oil",
 "romney sure doesn't need",
 'tougher tim

In [99]:
paras = para_segmenter_and_cleanup(text)
print(paras)

 Hello, Parma! Hello, Ohio! Well, it is good to be in Parma. Everybody, have a seat who can sit down.  It is good to be here. And this now, this is the right time for a rally. Now things have cooled off a little bit. And I had a beer in Amherst, at Ziggy's, so I'm feeling good, feeling steady.
I want to first of all I want everybody to give Wendy a big round of applause for the introduction. We've got some outstanding Ohioans in the house. First of all, the mayor of Parma, Tim DeGeeter is here. Stand up, Tim, so everybody can see you. There you go. I don't know who the guy holding the mayor is, but  nah.
 Oh, that's Jack!  Got it.
You guys have some of the best Members of Congress from this area. And nobody is fighting harder than your outstanding Senator, Sherrod Brown. And his outstanding wife Connie, who I love. We love Connie. Although, Connie, I did tell Sherrod he can take his tie off. I mean, it's 8 o'clock; we're in a park; it's hot.  But anyway, you keep on working on him.
Two

In [12]:
ngrams_list = return_n_grams(paras,2)
ngrams_list

[('Hello,', 'Parma!'),
 ('Parma!', 'Hello,'),
 ('Hello,', 'Ohio!'),
 ('Ohio!', 'Well,'),
 ('Well,', 'it'),
 ('it', 'is'),
 ('is', 'good'),
 ('good', 'to'),
 ('to', 'be'),
 ('be', 'in'),
 ('in', 'Parma.'),
 ('Parma.', 'Everybody,'),
 ('Everybody,', 'have'),
 ('have', 'a'),
 ('a', 'seat'),
 ('seat', 'who'),
 ('who', 'can'),
 ('can', 'sit'),
 ('sit', 'down.'),
 ('down.', 'It'),
 ('It', 'is'),
 ('is', 'good'),
 ('good', 'to'),
 ('to', 'be'),
 ('be', 'here.'),
 ('here.', 'And'),
 ('And', 'this'),
 ('this', 'now,'),
 ('now,', 'this'),
 ('this', 'is'),
 ('is', 'the'),
 ('the', 'right'),
 ('right', 'time'),
 ('time', 'for'),
 ('for', 'a'),
 ('a', 'rally.'),
 ('rally.', 'Now'),
 ('Now', 'things'),
 ('things', 'have'),
 ('have', 'cooled'),
 ('cooled', 'off'),
 ('off', 'a'),
 ('a', 'little'),
 ('little', 'bit.'),
 ('bit.', 'And'),
 ('And', 'I'),
 ('I', 'had'),
 ('had', 'a'),
 ('a', 'beer'),
 ('beer', 'in'),
 ('in', 'Amherst,'),
 ('Amherst,', 'at'),
 ('at', "Ziggy's,"),
 ("Ziggy's,", 'so'),
 ('so'

In [13]:
cleaned_text = clean_up(text)
cleaned_text

"\n Hello, Parma! Hello, Ohio! Well, it is good to be in Parma. Everybody, have a seat who can sit down.  It is good to be here. And this now, this is the right time for a rally. Now things have cooled off a little bit. And I had a beer in Amherst, at Ziggy's, so I'm feeling good, feeling steady.\nI want to first of all I want everybody to give Wendy a big round of applause for the introduction. We've got some outstanding Ohioans in the house. First of all, the mayor of Parma, Tim DeGeeter is here. Stand up, Tim, so everybody can see you. There you go. I don't know who the guy holding the mayor is, but  nah.\n\nAudience member. That's Jack!\n\n Oh, that's Jack!  Got it.\n\nYou guys have some of the best Members of Congress from this area. And nobody is fighting harder than your outstanding Senator, Sherrod Brown. And his outstanding wife Connie, who I love. We love Connie. Although, Connie, I did tell Sherrod he can take his tie off. I mean, it's 8 o'clock; we're in a park; it's hot.  

In [5]:
text = '''
The President. Hello, Parma! Hello, Ohio! Well, it is good to be in Parma. Everybody, have a seat who can sit down. [Laughter] It is good to be here. And this now, this is the right time for a rally. Now things have cooled off a little bit. And I had a beer in Amherst, at Ziggy's, so I'm feeling good, feeling steady.\nI want to first of all I want everybody to give Wendy a big round of applause for the introduction. We've got some outstanding Ohioans in the house. First of all, the mayor of Parma, Tim DeGeeter is here. Stand up, Tim, so everybody can see you. There you go. I don't know who the guy holding the mayor is, but [laughter] nah.\n\nAudience member. That's Jack!\n\nThe President. Oh, that's Jack! [Laughter] Got it.\n\nYou guys have some of the best Members of Congress from this area. And nobody is fighting harder than your outstanding Senator, Sherrod Brown. And his outstanding wife Connie, who I love. We love Connie. Although, Connie, I did tell Sherrod he can take his tie off. I mean, it's 8 o'clock; we're in a park; it's hot. [Laughter] But anyway, you keep on working on him.\n\nTwo outstanding Congresswomen, who are looking after Ohio's middle class every single day Betty Sutton is here, and Marcy Kaptur is here. And I could not be prouder to have, as one of my campaign cochairs, your former Governor: The outstanding Ted Strickland is here.\n\nAudience member. And an outstanding President!\n\nThe President. Oh, and an outstanding President, okay. [Laughter]\n\nNow, I just want to first of all say, thank you, guys, for taking your time to come out. I hope everybody had a wonderful Fourth of July. We had a little barbecue in my backyard. [Laughter] It was little, had a few fireworks. Some of you know that Malia turned 14 yesterday. And she is just an incredible young lady, just like Sasha is. Now, she used to be young enough where I could convince her that all these fireworks were for her birthday. But she doesn't believe me anymore. [Laughter]\n\nAnd Michelle sends her love. Malia was having a sleepover with some of her friends, and Michelle thought, you can't just have a house full of girls and no parental supervision. [Laughter] So just letting you know. But she says hi; the girls say hi; and Bo say hi.\n\nNow, you may not have noticed, but we're in the middle of campaign season here. And this will be one way or another, this will be my last campaign, which gets me to thinking about my first campaign. And I was a lawyer, and I was teaching law, and this seat the State senate seat came up. And I told Michelle, some people had talked to me about running for office, what did she think. And she said, well, that's a dumb idea. [Laughter]\n\nBut after I explained to her why I thought it might make sense for me to run, she joined in. And we didn't have a budget; we didn't have TV ads. We printed a bunch of stuff at Kinko's [laughter] and we had a few friends who volunteered. And we started knocking on doors, and I'd go in front of the grocery store, and I'd shake hands. And we would march in parades. In fact, the Fourth of July parade in Hyde Park, back in Chicago, they had a tradition where they made folks who were in office or running for office dress up. And somebody had an outfit for me. It was sort of like a minuteman outfit with, like, a hat and sort of the cutoff pants. And my legs are kind of skinny, so I didn't look very good in them. [Laughter]\n\n[At this point, an audience member whistled.]\n\nThe President. And then I won that race, and so I served in the State senate. And then I got the idea of running for the United States Senate after serving 8 years in the State senate. And I decided to go to Michelle and ask her what she thought. And she said, well, that's a dumb idea. [Laughter] But because I had had a chance as a State senator to travel the whole State of Illinois, which is a lot like Ohio it's a mix of big cities and rural communities and folks from every walk of life and I started just traveling the State. And I'd go to State fairs, and I'd we'd go to county fairs, and we'd stop in little towns and meet folks and go to VFW halls.\n\nAnd what I realized during that Senate race, and what I realized when I first ran for the State senate, was the reason I got into politics was because, in this country, there is this core American idea that we celebrated yesterday. And that is if you work hard, if you take responsibility for yourself and your family, and you don't get discouraged when you hit some setbacks, you can make it if you try. The basic American bargain that says, it doesn't matter what you look like, where you come from, where you worship, the idea is that you don't have to be born into fame or fortune. If you're willing to apply yourself and work hard, you can make it. You can follow your dreams.\n\nAnd the reason I got into that first race way back when and the reason I ran for the Senate and, ultimately, the reason I ran for President was because that had been available for my family. My grandparents participated in World War II. My grandfather fought in Patton's Army, and my grandmother worked on a bomber assembly line. But when my grandfather came back, he was able to study on the GI bill, and they were able to buy their first home with some help from the FHA. America gave them that opportunity, didn't give them a handout, but gave them that chance. And when my mom, a single mom, was raising two kids and trying to get her education, there were grants and loans available; and she could instill a love of learning in me and my sister, and we had a chance to get a great education.\n\nAnd you look at Michelle's family: Her dad was what was called a stationary engineer at the water filtration plant in Chicago, a blue-collar worker. Even though he had MS, he'd have to wake up an hour earlier than everybody else to get dressed and get to the job, but he never missed a day of work. And Michelle's mom, she stayed at home raising the kids and then later became a secretary. They never had a lot, but they had a chance to give their kids this great education.\n\nAnd so my whole life, and Michelle's whole life, was an example of this American Dream: this idea that if you work hard, you can find a job that pays a living wage and you can afford a home, you won't go bankrupt when you get sick, you can retire with some dignity and respect, you can take a vacation. It may not be fancy; you might not be going to some fancy resort, but you can go with your kids and enjoy each other's company and see the amazing sights of this country.\n\nAnd the problem was that you could feel, over the last decade, how that dream was slipping away for too many people. They were working harder and harder, but it seemed like wages weren't going up, incomes weren't going up, opportunities weren't increasing, the cost of health care was going up, the cost of college was going up, gas prices going up, groceries going up. So that basic bargain felt like it was slipping away from too many people.\n\nThat's what got me into politics: That's why I ran for the State senate; that's why I entered my first political race; that's why I'm running my last political race, because I want to make sure that the next generation, not just my kids, but everybody, has that same chance, that we get that bargain back for America's middle class. That's what Sherrod is fighting for and Marcy is fighting for and Betty is fighting for and I am fighting for. That's what you believe in. That's why I'm running for President of the United States of America.\n\nAudience members. Four more years! Four more years! Four more years!\n\nThe President. Now, this these past 3\ufffd years have made it tough on everybody. We saw the middle class struggling and folks who were trying to get into the middle class. I want to say, by the way, when I talk about middle class, I'm also talking about poor folks who are doing the right thing and trying to get into the middle class. And middle class is also an attitude. It's not just about income. It's about knowing what's important and not measuring your success just based on your bank account. But it's about your values and being responsible and looking after each other and giving back.\n\nAudience member. Giving back.\n\nThe President. Giving back.\n\nAnd it was tough even before the crisis hit. And then this crisis hit, and the auto industry almost went under, and people lost their jobs, and people lost their homes. And we've been working 24/7, 365 days a year, for the last 3\ufffd years to try to right the ship and recover.\n\nAnd we've seen progress.\n\nAudience member. It's working.\n\nThe President. We've seen progress. When some were saying, let's let Detroit go bankrupt, I said let's bet on the American worker. And now that Chrysler plant is churning out some of the best cars in the world. And GM is back on top. And Ford is on the move.\n\nWe've seen manufacturing start to come back to Ohio. We've seen the unemployment rate drop. So we've made progress. But we all know we've got so much more work to do. There are too many folks still out of work, homes still underwater. Too many kids are still trying to figure out how to pay for their college education. But you know what: These challenges had been building up over decades, and we knew we weren't going to turn it around overnight.\n\nWhat we wanted to do was make sure that we started moving in the right direction: moving forward, not moving backwards. And we've been able to do that. We've been moving forwards. And frankly, we've been moving forwards without a lot of help from the other side. We've been kind of yanking them. They've been on our ankles and pulling us back [laughter] but we've been moving forward. But the truth is, there is so much more we could be doing. And the reason I'm so glad you're here today is because the only way we are going to keep moving forward is with you.\n\nNow, I know you probably are already sick of this election with all the commercials and all the nastiness that's out there and the foolishness and the misinformation and all the political reporting about polls and who's up and who's down. But look, I want you to understand: Nothing could be bigger right now than the choice you're about to make; the choice you're about to make. Because it's more than just being about two candidates or two political parties; this is about two fundamentally different visions of how we move forward.\n\nMr. Romney, his allies in Congress, they've got a particular view. They believe that if we cut taxes, for especially the wealthiest Americans, about $5 trillion on top of the Bush tax cuts, paid for by cutting education and cutting making Medicare a voucher program and cutting programs for our kids, that somehow if we do that and we eliminate regulations that we've put in place thanks to the work of these great members of Congress, so that we don't have taxpayer bailouts of Wall Street banks anymore that if we roll those back, that somehow all this is going to benefit you. That first, it will benefit wealthy investors, and then things will rain down on you and benefit you in some fashion. That's the theory.\n\nNow, let me just say, this is a coherent theory. You can see it on their web sites. They don't make a secret about what they're planning to do. The only problem is we tried it. We tried it for about 10 years right before I was elected as President of the United States, and it didn't work. It didn't make the middle class stronger. Job growth was sluggish. Your wages and your incomes did not go up. It didn't grow our economy the way it needed to. And it culminated in the worst financial crisis we've had since the Great Depression. So their theory was tried, but it's a theory.\n\nI've got a different theory. I think they're wrong. As Wendy said, I don't think we grow our economy from the top down. I think we grow the economy from the middle class out. I think, we grow the economy by making sure everybody has got a fair shot and everybody is doing their fair share and everybody is playing by the same set of rules.\n\nAnd so when I look at how do we move forward, I say, we're making progress in manufacturing. Let's make more progress. Let's stop giving tax breaks to companies that are shipping jobs overseas. Let's give them to companies that are investing right here in the United States of America.\n\nWhen I think about moving forward, I say, let's invest in advanced manufacturing in this 21st-century economy for us to make sure that we're at the forefront of advanced battery manufacturing so the next generation of cars are built here in America; investing in clean energy to make sure that solar panels and wind turbines are built here in the United States of America. Those are smart investments. That's how we move forward.\n\nWhen I think about moving forward, I think about how do we make sure that American young people are the best educated in the world. So I want to hire new teachers, especially in math and science, and I want to train them better and pay them better. And I want to give 2 million more people the opportunity to go to a community college and train for the jobs that exist right now. And I want to make college more affordable for young people and bring tuition down. That's how we move forward. That's my vision for the future.\n\nAnd yes, I believe that we should have a health care system that works for middle class families. And I am couldn't be prouder of the work that we have done in getting this health care law passed. And there is so much misinformation out there, so I just want to clear up a couple of things. If you've got health insurance right now, here's what this bill means: It means that insurance companies can't drop you for no reason or when you need it most; it means that your kids can stay on your health insurance plan until they're 26 and have really gotten a job that provides them benefits; it means that they can't impose lifetime limits insurance companies can't impose lifetime limits so that when you really need it, suddenly they say, we don't have any more insurance for you. It's a patient's bill of rights for you if you have insurance, and if you don't have health insurance, then it gives you a chance to buy into a pool so that you get the same deal as folks who are working at big companies do. And if you're a senior on Medicare, it means your prescription drug costs are going to be lower and we're going to close that donut hole that has hurt a lot of seniors.\n\nNow, I think that was the right thing to do. That's part of moving forward.\n\nWe need to have an American energy policy. We're producing more oil and gas than we have in a long time, and we're importing less oil from overseas than we have in the past. But we can keep on doing more, not only increasing production of traditional fuels, but in developing and inventing new fuels because we need to free ourselves from our dependence on foreign oil. And by the way, we can put people back to work in the process. That's moving forward.\n\nI want to rebuild America. I promised, I'd end the war in Iraq. I ended the war in Iraq. We're transitioning out of Afghanistan. We've taken on Al Qaida, and we killed bin Laden. And now I want to take half the money that we're no longer spending on war and use it to drive down our deficits. And I want to take the other half to start doing some nation-building here in Ohio. Do some nation-building here at home. Put people back to work rebuilding our roads and our bridges and our schools, laying broadband lines and high-speed rail. That's how we build America. That's my idea of moving forward.\n\nSo I've got a different vision, and nowhere is that vision, by the way, bigger than when it comes to how do we deal with our debt and our deficit. Biggest contributors to our debt and our deficit, in addition to this recession, were two tax cuts that weren't paid for and two wars run on a credit card. And Mr. Romney's proposal to deal with this is another $5 trillion of tax cuts that aren't paid for; or if they're paid for, on the backs of you. That's not a plan to deal with our deficits.\n\nSo what I've said is, look, we're going to get rid of programs that don't work. We don't want to waste money. We can't afford it; we don't have enough. And by the way, I'm not somebody who believes that every government program works. I don't think government can solve every problem. I don't think we can always help folks who don't want to help themselves. Now, I don't care how much money we spend on schools if parents aren't parenting.\n\nSo government can't do everything, but there are some things we have to do to grow the economy. So we've got to invest in education. We've got to invest in basic science and research. We've got to invest in infrastructure. We've got to make sure that Medicare and Social Security are there for our seniors.\n\nSo in addition to wise spending cuts, there's nothing wrong with asking the wealthy to pay a little more in taxes. Now, let me just say this: I don't need a tax cut; Mr. Romney sure doesn't need a tax cut. I mean, I don't want a tax cut if it means, suddenly, students are having a tougher time affording going to college. I don't want a tax cut if it means seniors have to pay something more for Medicare.\n\nAnd you know what, a lot of successful people agree with that. And the reason they do is because they remember what it was like when they weren't successful and somebody gave them a helping hand. And so I think a lot of people want to do the right thing. But we can't have a culture that just encourages selfishness and looking out for yourself and not looking out for anybody else, just looking out for you and not the next generation.\n\nSo there are two fundamentally different visions about how we move the country forward. And the great thing about our democracy is you get to be the tiebreaker. It's up to you. It's up to you to decide what vision makes more sense. Mr. Romney's vision, the vision of his Republican allies and some of the special interests in Washington, we've tried that vision, and it didn't work. My vision, the last time we tried it was when Bill Clinton was President. And we created 23 million jobs, we had a budget surplus, and we created a whole lot of millionaires to boot. Everybody did well, because we're in it together.\n\nAnd ultimately, that's what it comes down to: Do you believe that we're on our own, all of us, or do you believe we're in it together? See, when I think about what's made America great, it's been our rugged individualism and our willingness to take risks and people going out there and starting a small business that becomes a medium-sized business, becomes a big business, they start hiring. All those things have contributed.\n\nBut what has also made us great is there are some things we've understood we do together. We build the Hoover Dam and the Golden Gate Bridge, together. We take care of our veterans, together. We send them to school on the GI bill, together. We invest in basic research that creates the Internet, together. We send a man to the moon, together. We build the Interstate Highway System, together. We do these things not because it helps any one of us individually, but because it gives opportunity for all of us to succeed. That's what's at stake in this election. And you are the tiebreaker.\n\nNow, over the next 4 months, you will see a lot of stuff on TV. The other side is spending more money than we've ever seen before. I mean, you've got billionaires just writing $10 million checks. They just they're spending money like nobody's business. And all of them have the same message, all these ads, which basically is: The economy is bad, and it's Obama's fault. [Laughter] I mean, they've got variations on this theme, but it's the same theme every time.\n\nSometimes they say, it's Obama's fault because he thinks government is all the answer. Sometimes it's because, well, Obama, he doesn't have private sector experience making a lot of money like the other guy. Some of it is, well, he just thinks everything is fine or he is in over his head. But it's all the same theme.\n\nAnd I think a lot of people are looking at this. They're saying, we don't know how this is going to turn out because we've never seen a sitting President outspent like this, with all this money and all these negative attacks. And maybe it'll work.\n\nAudience members. No!\n\nThe President. But it's a plan to run a campaign. It's not a plan to put people back to work. Their plan is not a plan to grow the middle class.\n\nAnd when I think back to my first campaign, and I think back to that first senate campaign that I ran, what I always remember, what Michelle and I learned, was folks can spend a whole lot of money and they can run a whole bunch of negative attacks. But when the American people decide what's right; when ordinary people are reminded of what's best in us; when we remember what our values are and we're willing to fight for them; when we're willing to say, you know what, this is not just about me, but this is about my kids and my grandkids, and so I'm going to work as hard as I can and I'm going to talk to my friends and I'm going to talk to my neighbors, I'm going to talk to my coworkers and I know it's going to be frustrating and I know it's going to be hard sometimes and there are going to be setbacks, but I'm going to stay with it, and enough people start having that feeling, nothing can stop them. Nothing can stop you. I don't care how much the other side spends. You cannot be stopped once you have decided what is right and what is true.\n\nIn the last election, I told people, I'm not a perfect man and I certainly wasn't going to be a perfect President. But I said, I'd always tell you what I thought and I'd always tell you where I stood and I'd spend every single day fighting as hard as I knew how for you to try to make sure that every single one of you had the same chance as my family had because I saw myself in you. I saw my hopes and dreams in you. And when I see your kids, I see my kids. And when I see your grandparents, I see my grandparents. And I have kept that promise.\n\nAnd as long as I have the privilege of being your President, I will keep that promise because I still believe in you. And if you still believe in me, and if you're willing to stand with me and knock on some doors with me and make some phone calls with me, we will finish what we started in 2008. And we will grow this middle class, and we will strengthen America, and we'll remind the world just why it is that we live in the greatest nation on Earth.\n\nGod bless you, and God bless the United States of America! Thank you!"'''


import json
with open('result.json', 'w') as fp:
    json.dump(data, fp)