In [1]:
import pandas as pd
import os , re

In [2]:
texts = []
filenames = []

for filename in os.listdir('./data/output_txt/'):
    if filename.endswith('.txt'):
        
        with open(f'./data/output_txt/{filename}') as f:
            text = f.read()
            texts.append(text)
            
        filename = filename[:-4]
        filenames.append(filename)
 

In [3]:
dct={}
dct['text'] = texts
dct['filename'] = filenames
df = pd.DataFrame(dct)
df.head()

Unnamed: 0,text,filename
0,PROCEEDINGS OF THE\n YORKSHIR\nE GEOLOGICAL\n ...,Cox et al 1987
1,TriassicpalynologyofcentralandnorthwesternEuro...,Kuerschner & Herngreen 2010
2,"JournalofSedimentaryResearch,2017,v.87,1226Œ12...",Gani 2017
3,"\n\n !\nˇ\n&ˇ\n\nˆˆˆ˙\n\n\n#˝˚#$#˚0\n\n""ˆ3˙\n)...",Iakovleva Brinkhuis & Cavagnetto 2001
4,"34Z FIELD EXCURSION NO. 10, NOVEMBER 11, 196Z ...",Wilson 1962


### Checking for duplicte files - looks good.

In [4]:
len(df)

1830

In [5]:
df['filename'].nunique()

1830

### Adding column of year document published

In [6]:
def get_year(cell):
        year = ((re.sub('\D', '', cell)))
        return year[:4]
    
df['year'] = df['filename'].map(get_year)    
df['year'].dtype

dtype('O')

In [7]:
df[df['year'] == '']

Unnamed: 0,text,filename,year
94,BASTROP COUNTY \n- 8 E. H. SELLARDS Bastrop Co...,Sellards,
112,\n1\n \nDIFFERENCES\n \nIN THE CHEMICAL COMPO...,Bogus et al,
901,The Catchment-Integrated Response to Rapid Cli...,Sharman et al Wilcox-PETM,
1545,Pages 51-61 \nAssemblages or Associations \n\...,Nichols text,
1632,GULF COASTASSOCIATION OF GEOLOGICALSOCIETIES T...,Breyer,


In [8]:
# Impute year so it can be made an integer 
df['year'] = df['year'].replace('', '0001')
df[df['year'] == '']

Unnamed: 0,text,filename,year


In [9]:
df['year'] = df['year'].astype(int)
df['year'].dtype

dtype('int64')

### Splitting data into train and test sets based on year

In [10]:
def split_years(year):
    if 1995 >= year <= 2019: 
            return 'test'
    else:
            return 'train'
    
df['split'] = df['year'].map(split_years)    

In [11]:
df.head()

Unnamed: 0,text,filename,year,split
0,PROCEEDINGS OF THE\n YORKSHIR\nE GEOLOGICAL\n ...,Cox et al 1987,1987,test
1,TriassicpalynologyofcentralandnorthwesternEuro...,Kuerschner & Herngreen 2010,2010,train
2,"JournalofSedimentaryResearch,2017,v.87,1226Œ12...",Gani 2017,2017,train
3,"\n\n !\nˇ\n&ˇ\n\nˆˆˆ˙\n\n\n#˝˚#$#˚0\n\n""ˆ3˙\n)...",Iakovleva Brinkhuis & Cavagnetto 2001,2001,train
4,"34Z FIELD EXCURSION NO. 10, NOVEMBER 11, 196Z ...",Wilson 1962,1962,test


In [12]:
df.groupby('split').count()

Unnamed: 0_level_0,text,filename,year
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,487,487,487
train,1343,1343,1343


In [13]:
487/len(df)

0.26612021857923496

In [14]:
df.head(15)

Unnamed: 0,text,filename,year,split
0,PROCEEDINGS OF THE\n YORKSHIR\nE GEOLOGICAL\n ...,Cox et al 1987,1987,test
1,TriassicpalynologyofcentralandnorthwesternEuro...,Kuerschner & Herngreen 2010,2010,train
2,"JournalofSedimentaryResearch,2017,v.87,1226Œ12...",Gani 2017,2017,train
3,"\n\n !\nˇ\n&ˇ\n\nˆˆˆ˙\n\n\n#˝˚#$#˚0\n\n""ˆ3˙\n)...",Iakovleva Brinkhuis & Cavagnetto 2001,2001,train
4,"34Z FIELD EXCURSION NO. 10, NOVEMBER 11, 196Z ...",Wilson 1962,1962,test
5,"N.z. Journal of Geology and Geophysics, Vol. 1...",Wilson 1976,1976,test
6,GEOLOGY | July 2014 \n| www.gsapubs.org \n607I...,Blum & Pecha 2014,2014,train
7,MesozoicandCenozoicSequenceStratigraphyofEurop...,Michelsen et al 1998,1998,train
8,REPLY ESTUARINE FACIES MODELS: CONCEPTUAL BASI...,Dalrymple et al 1994,1994,test
9,OntheglobaldistributionofLateCretaceousOstraco...,Puckett 2009,2009,train


In [15]:
df.shape

# 2129 total documents

(1830, 4)

### Cleaning text up a bit

In [16]:
# What does a document look like? 
df.iloc[10,0]

'Copyright2008,SEPM(SocietyforSedimentaryGeology)0883-1351/08/0023-0223/$3.00\nPALAIOS,2008,v.23,p.223Œ232\nResearchArticle\nDOI:10.2110/palo.2006.p06-127r\nDEEPBURROWSINSUBMARINEFAN-CHANNELDEPOSITSOFTHECERROTOROFORMATION\n(CRETACEOUS),CHILEANPATAGONIA:IMPLICATIONSFORFIRMGROUNDDEVELOPMENT\nANDCOLONIZATIONINTHEDEEPSEA\nSTEPHENM.HUBBARD\n1*andMICHAELR.SHULTZ\n21UniversityofCalgary,DepartmentofGeoscience,Calgary,Alberta,T2N1N4,Canada;\n2ChevronEnergyTechnologyCompany,6001BollingerCanyonRoad,\nSanRamon,California,94583-2324,USA\ne-mail:steve.hubbard@ucalgary.ca\nABSTRACTTheGlossifungitesichnofaciesrecognizedinCretaceousstrata(Cerro\nToroFormation)oftheMagallanesforelandbasininsouthernChile\n\nrepresentsanimportantdiscoveryinthatitextendsthestratigraphic\nutilityof˚rmgroundtrace-fossilsuitesintothick-bedded,gravity-\n˜owdepositsofsubmarinefan-channelenvironments.Thetrace-\n\nfossilsuiteconsistsofatypicallylarge\nDiplocraterion,Skolithos,and\nArenicolites,whichmayreachaninferredlengthof7m.Th

#### Documents look like they had a tough time reading in via pyPDFreader.
Many run on words and strange sentence breaks.
ie. "itcannotbeconfusedwithanyotherpollentypes"

Furture work - see if there is another reader that would work a bit better

In [17]:
# function to clean the text - from gensim.preprocessing module example
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
    gsp.strip_tags,     
    gsp.strip_punctuation,
    gsp.strip_numeric,
    gsp.remove_stopwords, 
    gsp.strip_short, 
    gsp.stem_text,
    gsp.strip_multiple_whitespaces 
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [18]:
#test it out first on one doc
#clean_text(df.iloc[10,0])

In [19]:
# Now do it for all documents 
df['text'] = df['text'].map(lambda x: clean_text(x))

In [20]:
# didn't work very well.. 
df['text'].head(15)

0     proceed yorkshir geolog societi vol upper jura...
1     triassicpalynologyofcentralandnorthwesterneuro...
2     journalofsedimentaryresearch currentrippl doi ...
3     ˆˆˆ˙ ˛kˆ c˝ˇ hh˛ hhˇ lk˘ hhˇ hhˇ˚ hhˇ d˜˛ hhˇ ...
4     field excurs novemb tertiari format austin hou...
5     journal geologi geophys vol note new zealand g...
6     geologi juli www gsapub org introductionearli ...
7     mesozoicandcenozoicsequencestratigraphyofeurop...
8     repli estuarin faci model conceptu basi strati...
9     ontheglobaldistributionoflatecretaceousostraco...
10    copyright sepm societyforsedimentarygeolog pal...
11    journalof sedimentari research vol arch copyri...
12    volbeda etal chem soc specialissueonhydrogenas...
13    originandcompositionoforganicmatterintidalﬂats...
14    tidal iwc cretaceoo flwial strata utah usa kei...
Name: text, dtype: object

In [21]:
df.to_pickle('assets/df_clean.pkl')

### Let's try again with Corpus specific stop words

In [48]:
import nltk
from nltk.corpus import stopwords
df = pd.read_pickle('assets/df_clean.pkl')

In [49]:

specific = ['figs', 'fig', 'et', 'al', 'pl','appendix','figure','cm', 'ft', 'sp'\
            , 'pp', 'iv', 'etal', 'ed', 'eds', 'http', 'ma', 'th', 'tion', 'ing',\
           'cf', 'ii', 'www', 'tions', 'strati', 'km', 'com', 'bulletin', 'doi', \
            'org', 'society','springer', 'verlag', 'pa', 'spec', 'pub', 'assoc',\
            'publication','university', 'press', 'geologists', 'geological',\
            'association', 'ph', 'comm', 'pers', 'geol', 'surv', 'bull',\
            'journal', 'soc', 'sci', 'letters', 'lett', 'geophys', 'res',\
            'acad', 'mar', 'acad', 'palaeobotany', 'palaeoclimatology', \
            'palaeogeography','societies', 'bureau', 'economic', 'prof',\
            'palaeoecology','paper', 'file', 'report', 'open', 'london',\
            'america', 'elsevier','amsterdam', 'sepm', 'earthplanet',\
           'paleoclimatol', 'palaeoecol', 'np', 'sc', 'palaeogeogr', 'palaeoclimatol',\
            'american', 'geo', 'rev', 'journal', 'und', 'review', 'samples',\
            'collected', 'allrightsreserved', 'clim', 'elsevierb', 'cosmochim',\
            'sciencereviews', 'levelchanges', 'ne', 'sepmspec', 'publ', 'acta',\
           'internationalassociationofsedimentologists', 'palaeobot', 'polynol',\
           'sedi', 'ment', 'deposi', 'tional', 'odp','copyright']


stop_words = nltk.corpus.stopwords.words('english')+ specific

In [50]:
from nltk.tokenize import word_tokenize 

def remove_stop(s):
    word_tokens = word_tokenize(s) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
           
    return ' '.join(filtered_sentence)
    

In [51]:
df['text'] = df['text'].map(lambda x: remove_stop(x))

In [52]:
df['text'].head(15)

0     proceed yorkshir geolog societi vol upper jura...
1     triassicpalynologyofcentralandnorthwesterneuro...
2     journalofsedimentaryresearch currentrippl jsr ...
3     ˆˆˆ˙ ˛kˆ c˝ˇ hh˛ hhˇ lk˘ hhˇ hhˇ˚ hhˇ d˜˛ hhˇ ...
4     field excurs novemb tertiari format austin hou...
5     geologi vol note new zealand geolog survei lat...
6     geologi juli gsapub introductionearli cretac s...
7     mesozoicandcenozoicsequencestratigraphyofeurop...
8     repli estuarin faci model conceptu basi strati...
9     ontheglobaldistributionoflatecretaceousostraco...
10    societyforsedimentarygeolog palaio researchart...
11    journalof sedimentari research vol arch copyri...
12    volbeda chem specialissueonhydrogenas pickett ...
13    originandcompositionoforganicmatterintidalﬂats...
14    tidal iwc cretaceoo flwial strata utah usa kei...
Name: text, dtype: object

### Save to file for posterity

In [53]:
df.to_pickle('assets/df_clean2.pkl')