# Additional string manipulation and regular expression examples

## Libraries and settings

In [1]:
# Libraries
import os
import re
import string
import pandas as pd

# Ignore warnings
# NOTE(review): filterwarnings('ignore') without a category suppresses every
# warning for the rest of the session — consider narrowing it to a category.
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
# NOTE(review): the printed value is machine-specific, so the saved output
# will differ on another machine.
print(os.getcwd())

U:\Lektionen\DA_HS2022\KK\Week_13


## Create dataframe

In [2]:
# Build the example data frame 'df' from three raw text columns
raw_columns = {
    'price_raw': ['CHF 12000', 'CHF 24000', 'CHF 54000'],
    'kilometer_raw': ["xfz 120'000 km", "PLZ 8430, 85'000 km", "45'000 km xyz"],
    'description_raw': ['BMW i3', 'VW Caddy 1.4', 'FIAT Punto 1.2'],
}
df = pd.DataFrame(data=raw_columns)
df

Unnamed: 0,price_raw,kilometer_raw,description_raw
0,CHF 12000,xfz 120'000 km,BMW i3
1,CHF 24000,"PLZ 8430, 85'000 km",VW Caddy 1.4
2,CHF 54000,45'000 km xyz,FIAT Punto 1.2


## String manipulation & regular expression examples
If you use regular expressions, always test them first on a website such as https://regex101.com

### Variable Price

In [3]:
# Show, for each raw price string, every run of digits converted to int
for raw_price in df['price_raw']:
    print([int(token) for token in re.findall('[0-9]+', raw_price)])

[12000]
[24000]
[54000]


In [4]:
# Extract the first number of every raw price and write it to a new variable
price = []
for raw_price in df['price_raw']:
    # re.search returns the first run of digits, or None when the string has none
    match = re.search('[0-9]+', raw_price)
    # Keep an explicit None for a missing price instead of relying on a bare 'except'
    price.append(match.group() if match else None)

# Save as new variable in the pandas data frame.
# pd.to_numeric yields int64 when every row has a price and float64 with NaN
# otherwise (int(None) would raise a TypeError at this point).
df['price'] = pd.to_numeric(price)

# Print first 5 values
print(df['price_raw'].head(), '\n')
print(df['price'].head())

0    CHF 12000
1    CHF 24000
2    CHF 54000
Name: price_raw, dtype: object 

0    12000
1    24000
2    54000
Name: price, dtype: int64


### Variable Kilometer

In [5]:
# List every number per row after removing the apostrophe used as
# thousands separator (one of the rows contains two numbers)
for raw_km in df['kilometer_raw']:
    print(re.findall('[0-9]+', raw_km.replace("'", '')))

['120000']
['8430', '85000']
['45000']


In [6]:
# If several numbers occur in a row, keep only the LAST one
# (the number that comes last in these strings is the mileage)
kilometer = []
for raw_km in df['kilometer_raw']:
    # Remove the apostrophe used as thousands separator: 120'000 -> 120000
    without_separator = raw_km.replace('\'', '')
    numbers = re.findall('[0-9]+', without_separator)
    # numbers[-1] is the last match; None marks rows that contain no number at all
    kilometer.append(numbers[-1] if numbers else None)

# Write to data frame.
# pd.to_numeric yields int64 when every row has a value and float64 with NaN otherwise
df['kilometer'] = pd.to_numeric(kilometer)
df

Unnamed: 0,price_raw,kilometer_raw,description_raw,price,kilometer
0,CHF 12000,xfz 120'000 km,BMW i3,12000,120000
1,CHF 24000,"PLZ 8430, 85'000 km",VW Caddy 1.4,24000,85000
2,CHF 54000,45'000 km xyz,FIAT Punto 1.2,54000,45000


### Variable Description

In [7]:
# Locate the first space in every description (the search starts at index 1)
space_positions = [text.find(" ", 1) for text in df['description_raw']]
for pos in space_positions:
    print(f'The first empty space occurs at position: {pos:.0f}')

The first empty space occurs at position: 3
The first empty space occurs at position: 2
The first empty space occurs at position: 4


In [8]:
# Now, use the information to extract the brand
for description in df['description_raw']:
    # Position of 1st empty space; str.find returns -1 when there is none
    pos = description.find(" ", 1)
    # Substring [from:to]. Without a space the whole string is the brand;
    # slicing with pos == -1 would silently drop the last character.
    substr = description if pos == -1 else description[0:pos]
    print(substr)

BMW
VW
FIAT
