# Extrahieren von Text zum Social Media Archiv 
- https://democrats-intelligence.house.gov/social-media-content/
- ```pip install progressbar```

In [20]:
import textract
import re
import os
import pandas as pd
import progressbar
import time

## Mit einer Datei ausprobieren

- Einleseversuch

In [9]:
#Testing with one file
text = textract.process('files/P(1)0000054.pdf', method='pdfminer')
text = str(text).replace("\n\n", " ")
text = str(text).replace("\n", " ")
text

"b'Ad ID\\n\\n374\\n\\nAd Text\\n\\nJoin us because we care. Black matters.\\n\\nAd Landing Page\\n\\nhttps://www.facebook.com/Black-Matters-1579673598947501 /\\n\\nAd Targeting\\n\\nLocation: United States: Baltimore (+20 km) Maryland; St. Louis (+20 km)\\nMissouri\\nExcluded Connections: Exclude people who like Black Matters\\nAge: 18 - 65+\\nLanguage: English (UK) or English (US)\\nPlacements: News Feed on desktop computers or News Feed on mobile\\ndevices\\n\\nAd Impressions\\n\\n137\\n\\nAd Clicks\\n\\n0\\n\\nAd Spend\\n\\n44.87 RUB\\n\\nAd Creation Date\\n\\n06/10/15 02:59:53 AM PDT\\n\\nP(1)0000054\\n\\nRedactions Completed at the Direction of Ranking Member of the US House Permanent Select Committee on Intelligence\\x0cSuggested Page\\n\\nBlack Matters\\nSponsored\\n\\nJoin us because we care. Black matters.\\n\\nBlack Matters\\nCommun,ty\\n224,537 people like  nis.\\n\\n,` Like Page\\n\\nP(1)0000055\\n\\nRedactions Completed at the Direction of Ranking Member of the US House P

- Suchen nach dem Adtext:

In [10]:
adtext = re.search("Ad Text.*Ad Landing Page", text)
adtext = re.search("Ad Text.*Ad Landing Page", text).group().replace("Ad Landing Page", "")
adtext = adtext.replace("Ad Text", "").replace("\\n", " ").strip()
adtext

'Join us because we care. Black matters.'

In [25]:
def get_location(text):
    if re.search("Location.*(Age)", text) != None:
        pattern = re.search("Location.*(Age)", text)
        loc = pattern.group().replace('Age','').replace('Location - Living In: ','')
    else:
        loc = 'N/A'
    return loc
get_location(text)

'Location: United States: Baltimore (+20 km) Maryland; St. Louis (+20 km)\\nMissouri\\nExcluded Connections: Exclude people who like Black Matters\\n'

In [26]:
def get_date(text):
    if re.search("Creation Date.*(Redactions)", str(text)) != None:
        pattern = re.search("Creation Date.*(Redactions)", str(text))
        pattern = pattern.group().replace('Creation Date\\n\\n','')
        date = re.search("\d\d/\d\d/\d\d \d\d:\d\d:\d\d [A-Z][A-Z] [A-Z]{3}", pattern)
        if date != None:
            date = date.group()
    else: 
        date = 'N/A'
    return date
get_date(text)

'06/10/15 02:59:53 AM PDT'

In [13]:
def get_ad_klicks(text):
    if re.search("Ad Clicks.*", text) != None:
        klicks = re.search("Ad Clicks.*", text)
        klicks = klicks.group()[:25].replace(",", "")
        klicks = re.search("[0-9]+", klicks)
        if klicks != None:
            klicks = klicks.group()
    else:
        klicks = 'N/A'
    return klicks
get_ad_klicks(text)

'0'

In [14]:
def get_ad_impressions(text):
    if re.search("Ad Impressions.*", text) != None:
        imps = re.search("Ad Impressions.*", text)
        imps = imps.group()[:25].replace(",", "")
        imps = re.search("[0-9]+", imps)
        if imps != None:
            imps = imps.group()
    else:
        imps = 'N/A'
    return imps
get_ad_impressions(text)

'137'

In [15]:
def get_ad_spend(text):
    if re.search("Ad Spend.*", text) != None:
        adspend = re.search("Ad Spend.*", text)
        adspend = adspend.group()[:25].replace(",", "")
        adspend = re.search("[0-9]+.[0-9]+", adspend)
        if adspend != None:
            adspend = adspend.group()
    else:
        adspend = 'N/A'
    return adspend
get_ad_spend(text)

'44.87'

## Aufbau der Dateiliste

In [16]:
!ls

2.1 Importing Text from Everywhere.ipynb
2.2 Real world example with textract.ipynb
2.3 Named Entities Recognition.ipynb
2.4 Classifying Text.ipynb
2.5 OpenCV - Gesichtserkennung.ipynb
2.6 Document Similarity with TDF-IDF.ipynb
2.7 [OPTIONAL] Similar Words and Sentences with spaCy.ipynb
2.8 [OPTIONAL] Objekterkennung as a service mit IBM Vision.ipynb
2.9 [OPTIONAL] Sentiment, Spracherkennung, Spellchecking.ipynb
Wohnung.docx
bild.jpg
example.docx
example.pdf
example.png
[34mfiles[m[m
fruitbowl.jpg
godfather.txt
guardian.png
[34mhaarcascades[m[m
[34minstagram[m[m
obama.txt
readable.csv
schindlers_list.txt
shawnshank.txt
[34msongtexte[m[m
tf-idf.jpeg


In [21]:
#Creating file list
lt = sorted(os.listdir("files"))

In [22]:
lst = []
for elem in lt:
    if ".pdf" in elem:
        lst.append(elem)

In [23]:
#lst = lst[0:4]

In [27]:
#Creating dictionary for:

#'Location', 'Number', 'Date', 'Interests',
#'Age', 'Language', 'Placements', 'Ad Clicks',
#'Ad Impressions', 'Ad Spend'

bar = progressbar.ProgressBar()
results = []

for elem,i in zip(lst, bar(range(len(lst)-1))):
    #print(elem)
    if 'ultimate_names.csv' in elem:
        continue
    
    else:
        
        text = textract.process('files/'+elem, method='pdfminer')
        text = str(text).replace("\n\n", " ")
        text = str(text).replace("\n", " ")

    
        location = get_location(text)
        date = get_date(text)
        klicks = get_ad_klicks(text)
        imps = get_ad_impressions(text)
        adspend = get_ad_spend(text)
        
        
        if re.search("Interests:.*", text) != None:
            interests = re.search("Interests:.*", text).group().split('\\n')[0]
        else:
            interests = 'N/A'
    
        if re.search("Age:.*", text) != None:
            age = re.search("Age:.*", text).group().split('\\n')[0]
        else:
            age = 'N/A'
        
        if re.search("Language:.*", text) != None:
            lang = re.search("Language:.*", text).group().split('\\n')[0]
        else:
            lang = 'N/A'
    
        if re.search("Placements:.*", text) != None:
            place = re.search("Placements:.*", text).group().split('\\n')[0]
        else:
            place = 'N/A'
    
        if re.search("Ad Text.*Ad Landing Page", text) != None:
            adtext = re.search("Ad Text.*Ad Landing Page", text).group().replace("Ad Landing Page", "")
            adtext = adtext.replace("Ad Text", "").replace("\\n", " ").strip()
        else:
            place = 'N/A'
        
        mini_dict = {'Location': location,
                 'Number': elem,
                 'Date': date,
                 'Interests': interests,
                 'Age': age,
                 'Language': lang,
                 'Placements': place,
                 'Ad Text': adtext,
                 'Ad Clicks': klicks,
                 'Ad Impressions': imps,
                 'Ad Spend': adspend}
        
        results.append(mini_dict)


100% |########################################################################|


### Abspeichern im Dataframe

In [28]:
df = pd.DataFrame(results)

In [29]:
df

Unnamed: 0,Ad Clicks,Ad Impressions,Ad Spend,Ad Text,Age,Date,Interests,Language,Location,Number,Placements
0,0.0,137.0,44.87,Join us because we care. Black matters.,Age: 18 - 65+,06/10/15 02:59:53 AM PDT,,Language: English (UK) or English (US),Location: United States: Baltimore (+20 km) Ma...,P(1)0000054.pdf,Placements: News Feed on desktop computers or ...
1,35.0,452.0,184.81,NOT EVERY BOY WANTS TO BE A SOLDIER. A beauti...,Age: 18 - 65+,06/23/15 07:04:01 AM PDT,,,United States\n\n,P(1)0000180.pdf,Placements: News Feed on desktop computers or ...
2,,,,NOT EVERY BOY WANTS TO BE A SOLDIER. A beauti...,Age: 18 - 65+,,,,Location - Living In: United States \n\n,P(1)0000182.pdf,
3,0.0,31.0,33.59,?????? ??? ????? ? ??????????,Age: 18 - 65+,06/09/15 03:50:21 AM PDT,,Language: English (UK) or English (US),Location: United States\nExcluded Connections:...,P(1)0001844.pdf,"Placements: News Feed on desktop computers, Ne..."
4,4.0,326.0,45.94,California... knows how to party California......,Age: 18 - 65+,06/10/15 07:34:52 AM PDT,,,Location: United States: Baltimore Maryland; F...,P(1)0002111.pdf,Placements: News Feed on desktop computers or ...
5,517.0,1.0,99.97,"Since 2010, over 350 of our lives have been ta...",Age: 18 - 65+,06/12/15 03:13:16 AM PDT,,,Location: United States Baltimore Maryland: F...,P(1)0002112.pdf,Placements: News Feed on desktop computers or ...
6,7.0,125.0,34.77,"\'Just like Trayvon Martin, race mattered for ...",Age: 18 - 65+,06/11/15 06:51:30 AM PDT,,,Location: United States: Baltimore Maryland; F...,P(1)0002113.pdf,Placements: News Feed on desktop computers or ...
7,17.0,168.0,31.54,Race war started by Texas teacher A Texas fou...,Age: 18 - 65+,06/11/15 07:03:58 AM PDT,,,Location: United States: Baltimore Maryland; F...,P(1)0002114.pdf,Placements: News Feed on desktop computers or ...
8,18.0,482.0,90.65,The image of 1938 shows several African Americ...,Age: 18 - 65+,06/15/15 07:21:33 AM PDT,,,Location: United States: Baltimore Maryland; F...,P(1)0002115.pdf,Placements: News Feed on desktop computers or ...


### Abspeichern als CSV

In [30]:
df.to_csv('readable.csv')